In [1]:
import numpy as np
import pandas as pd
from pandas import read_csv
from sklearn.metrics import mean_squared_error
from math import sqrt
from numpy import mean
from matplotlib import pyplot
import matplotlib.pyplot as plt
from datetime import datetime
In [2]:
import warnings
warnings.filterwarnings('ignore')

Loading the data

In [3]:
consumption = pd.read_csv('https://raw.githubusercontent.com/Louise198713/Speciale/main/Privatforbrug%201990K1-2020K4.csv',
                 sep=';',
                 header=0,
                 index_col=0,
                 parse_dates=[0],
                 names=['Time','consumption'])
consumption['Time']=pd.period_range('1990-01', periods=124, freq='Q')
consumption['consumption'] = pd.to_numeric(consumption['consumption'])
consumption=consumption.set_index('Time')
consumption.index = consumption.index.to_timestamp()

exports = pd.read_csv('https://raw.githubusercontent.com/Louise198713/Speciale/main/Eksport%201990K1-2020K4.csv',
                 sep=';',
                 header=0,
                 index_col=0,
                 parse_dates=[0],
                 names=['Time','exports'])
exports['Time']=pd.period_range('1990-01', periods=124, freq='Q')
exports['exports'] = pd.to_numeric(exports['exports'])
exports=exports.set_index('Time')
exports.index = exports.index.to_timestamp()

imports = pd.read_csv('https://raw.githubusercontent.com/Louise198713/Speciale/main/Import%201990K1-2020K4.csv',
                 sep=';',
                 header=0,
                 index_col=0,
                 parse_dates=[0],
                 names=['Time','imports'])
imports['Time']=pd.period_range('1990-01', periods=124, freq='Q')
imports['imports'] = pd.to_numeric(imports['imports'])
imports=imports.set_index('Time')
imports.index = imports.index.to_timestamp()

investments = pd.read_csv('https://raw.githubusercontent.com/Louise198713/Speciale/main/Investment1990-2020.csv',
                 sep=';',
                 header=0,
                 index_col=0,
                 parse_dates=[0],
                 names=['Time','investments'])
investments['Time']=pd.period_range('1990-01', periods=124, freq='Q')
investments['investments'] = pd.to_numeric(investments['investments'])
investments=investments.set_index('Time')
investments.index = investments.index.to_timestamp()

government_expenditure = pd.read_csv('https://raw.githubusercontent.com/Louise198713/Speciale/main/Public%20consumption1990-2020.csv',
                 sep=';',
                 header=0,
                 index_col=0,
                 parse_dates=[0],
                 names=['Time','government_expenditure'])
government_expenditure['Time']=pd.period_range('1990-01', periods=124, freq='Q')
government_expenditure['government_expenditure'] = pd.to_numeric(government_expenditure['government_expenditure'])
government_expenditure=government_expenditure.set_index('Time')
government_expenditure.index = government_expenditure.index.to_timestamp()

gdp = pd.read_csv('https://raw.githubusercontent.com/Louise198713/Speciale/main/GDP1990-2020.csv',
                 sep=';',
                 header=0,
                 index_col=0,
                 parse_dates=[0],
                 names=['Time','gdp'])
gdp['Time']=pd.period_range('1990-01', periods=124, freq='Q')
gdp['gdp'] = pd.to_numeric(gdp['gdp'])
gdp=gdp.set_index('Time')
gdp.index = gdp.index.to_timestamp()

Initial data analysis

In [4]:
plt.rcParams["figure.figsize"] = (20,3)
%matplotlib inline
plt.style.use('fivethirtyeight')
import seaborn as sns
sns.set(rc={'figure.figsize':(15, 12)})
In [5]:
fig, axs = plt.subplots(3, 2, sharex=False, sharey=False, figsize=(18,12))
axs[0, 0].plot(gdp.gdp)
axs[0, 0].set_title('GDP')
plt.setp(axs[0, 0], ylabel='Bill. DKK')
axs[0, 1].plot(consumption.consumption, 'tab:blue')
axs[0, 1].set_title('Consumption')
plt.setp(axs[1, 0], ylabel='Bill. DKK')
axs[1, 0].plot(imports.imports)
axs[1, 0].set_title('Imports')
plt.setp(axs[2, 0], ylabel='Bill. DKK')
axs[1, 1].plot(exports.exports)
axs[1, 1].set_title('Exports')
plt.setp(axs[2, 1], ylabel='Bill. DKK')
axs[2, 0].plot(investments.investments)
axs[2, 0].set_title('Investments')
plt.setp(axs[0, 1], ylabel='Bill. DKK')
axs[2, 1].plot(government_expenditure.government_expenditure)
axs[2, 1].set_title('Government expenditure')
plt.setp(axs[1, 1], ylabel='Bill. DKK')
Out[5]:
[Text(0, 0.5, 'Bill. DKK')]

Splitting the data set

In [6]:
nobs = 6
In [7]:
imports_train, imports_test =imports[0:-nobs], imports[-nobs:]
exports_train, exports_test =exports[0:-nobs], exports[-nobs:]
investments_train, investments_test =investments[0:-nobs], investments[-nobs:]
government_expenditure_train, government_expenditure_test =government_expenditure[0:-nobs], government_expenditure[-nobs:]
gdp_train, gdp_test =gdp[0:-nobs], gdp[-nobs:]
consumption_train, consumption_test =consumption[0:-nobs], consumption[-nobs:]


# Check size
print(consumption_train.shape)  # (118, 1)
print(consumption_test.shape)  # (6, 1)
(118, 1)
(6, 1)

Baseline model

Baseline for Consumption

In [8]:
X = consumption.values
train, test = X[0:-6], X[-6:]

# walk-forward validation for test predictions
history = [x for x in train]
predictions_test = list()
for i in range(len(test)):
	# make prediction
	predictions_test.append(history[-4])
	# observation
	history.append(test[i])

#Converting to dataframe to be able to concatenate
predictions_test = pd. DataFrame(predictions_test) 

# walk-forward validation for train predictions
predictions_train = list()
for i in range(len(train)):
	# make prediction
	predictions_train.append(history[-4])
	# observation
	history.append(train[i])
 
#Converting to dataframe and replacing first 4 observations with NAs
predictions_train = pd.DataFrame(predictions_train)
predictions_train[0][:4] = np.NaN 

#Concatenating predictions
predictions_total = pd.concat([predictions_train, predictions_test], ignore_index=True)

#Performance measures
forecast_values_consumption_baseline = predictions_test

rmse_test = sqrt(mean_squared_error(test, predictions_test))
rmse_train = sqrt(mean_squared_error(train[4:], predictions_train[4:]))

print('RMSE Train: %.3f' % rmse_train)
print('RMSE Test: %.3f' % rmse_test)

predictions = predictions_total[4:]
observations = pd.DataFrame(X[4:])
predictions['Time']=pd.period_range('1991-01', periods=120, freq='Q')
predictions=predictions.set_index('Time')
predictions.index = predictions.index.to_timestamp()
observations['Time']=pd.period_range('1991-01', periods=120, freq='Q')
observations=observations.set_index('Time')
observations.index = observations.index.to_timestamp()

# line plot of observed vs predicted
plt.figure(figsize=(15, 7.5))
plt.axvspan(consumption.index[-7], consumption.index[-1], alpha=0.5, color='lightgrey')
plt.plot(observations, color='r', label='actual')
plt.plot(predictions, label='prediction')
plt.legend()
plt.show()
RMSE Train: 6.754
RMSE Test: 8.553

Baseline for Imports

In [9]:
X = imports.values
train, test = X[0:-6], X[-6:]

# walk-forward validation for test predictions
history = [x for x in train]
predictions_test = list()
for i in range(len(test)):
	# make prediction
	predictions_test.append(history[-4])
	# observation
	history.append(test[i])

#Converting to dataframe to be able to concatenate
predictions_test = pd. DataFrame(predictions_test) 

# walk-forward validation for train predictions
predictions_train = list()
for i in range(len(train)):
	# make prediction
	predictions_train.append(history[-4])
	# observation
	history.append(train[i])
 
#Converting to dataframe and replacing first 4 observations with NAs
predictions_train = pd.DataFrame(predictions_train)
predictions_train[0][:4] = np.NaN 

#Concatenating predictions
predictions_total = pd.concat([predictions_train, predictions_test], ignore_index=True)

# report performance
forecast_values_imports_baseline = predictions_test

rmse_test = sqrt(mean_squared_error(test, predictions_test))
rmse_train = sqrt(mean_squared_error(train[4:], predictions_train[4:]))

print('RMSE Train: %.3f' % rmse_train)
print('RMSE Test: %.3f' % rmse_test)

predictions = predictions_total[4:]
observations = pd.DataFrame(X[4:])
predictions['Time']=pd.period_range('1991-01', periods=120, freq='Q')
predictions=predictions.set_index('Time')
predictions.index = predictions.index.to_timestamp()
observations['Time']=pd.period_range('1991-01', periods=120, freq='Q')
observations=observations.set_index('Time')
observations.index = observations.index.to_timestamp()

# line plot of observed vs predicted
plt.figure(figsize=(15, 7.5))
plt.axvspan(imports.index[-7], imports.index[-1], alpha=0.5, color='lightgrey')
plt.plot(observations, color='r', label='actual')
plt.plot(predictions, label='prediction')
plt.legend()
plt.show()
RMSE Train: 16.053
RMSE Test: 21.465

Baseline for Export

In [10]:
X = exports.values
train, test = X[0:-6], X[-6:]

# walk-forward validation for test predictions
history = [x for x in train]
predictions_test = list()
for i in range(len(test)):
	# make prediction
	predictions_test.append(history[-4])
	# observation
	history.append(test[i])

#Converting to dataframe to be able to concatenate
predictions_test = pd. DataFrame(predictions_test) 

# walk-forward validation for train predictions
predictions_train = list()
for i in range(len(train)):
	# make prediction
	predictions_train.append(history[-4])
	# observation
	history.append(train[i])
 
#Converting to dataframe and replacing first 4 observations with NAs
predictions_train = pd.DataFrame(predictions_train)
predictions_train[0][:4] = np.NaN 

#Concatenating predictions
predictions_total = pd.concat([predictions_train, predictions_test], ignore_index=True)

# report performance
forecast_values_exports_baseline = predictions_test

rmse_test = sqrt(mean_squared_error(test, predictions_test))
rmse_train = sqrt(mean_squared_error(train[4:], predictions_train[4:]))

print('RMSE Train: %.3f' % rmse_train)
print('RMSE Test: %.3f' % rmse_test)

predictions = predictions_total[4:]
observations = pd.DataFrame(X[4:])
predictions['Time']=pd.period_range('1991-01', periods=120, freq='Q')
predictions=predictions.set_index('Time')
predictions.index = predictions.index.to_timestamp()
observations['Time']=pd.period_range('1991-01', periods=120, freq='Q')
observations=observations.set_index('Time')
observations.index = observations.index.to_timestamp()

# line plot of observed vs predicted
plt.figure(figsize=(15, 7.5))
plt.axvspan(exports.index[-7], exports.index[-1], alpha=0.5, color='lightgrey')
plt.plot(observations, color='r', label='actual')
plt.plot(predictions, label='prediction')
plt.legend()
plt.show()
RMSE Train: 17.024
RMSE Test: 28.690

Baseline for Investment

In [11]:
X = investments.values
train, test = X[0:-6], X[-6:]

# walk-forward validation for test predictions
history = [x for x in train]
predictions_test = list()
for i in range(len(test)):
	# make prediction
	predictions_test.append(history[-4])
	# observation
	history.append(test[i])

#Converting to dataframe to be able to concatenate
predictions_test = pd. DataFrame(predictions_test) 

# walk-forward validation for train predictions
predictions_train = list()
for i in range(len(train)):
	# make prediction
	predictions_train.append(history[-4])
	# observation
	history.append(train[i])
 
#Converting to dataframe and replacing first 4 observations with NAs
predictions_train = pd.DataFrame(predictions_train)
predictions_train[0][:4] = np.NaN 

#Concatenating predictions
predictions_total = pd.concat([predictions_train, predictions_test], ignore_index=True)

# report performance
forecast_values_investments_baseline = predictions_test

rmse_test = sqrt(mean_squared_error(test, predictions_test))
rmse_train = sqrt(mean_squared_error(train[4:], predictions_train[4:]))

print('RMSE Train: %.3f' % rmse_train)
print('RMSE Test: %.3f' % rmse_test)

predictions = predictions_total[4:]
observations = pd.DataFrame(X[4:])
predictions['Time']=pd.period_range('1991-01', periods=120, freq='Q')
predictions=predictions.set_index('Time')
predictions.index = predictions.index.to_timestamp()
observations['Time']=pd.period_range('1991-01', periods=120, freq='Q')
observations=observations.set_index('Time')
observations.index = observations.index.to_timestamp()

# line plot of observed vs predicted
plt.figure(figsize=(15, 7.5))
plt.axvspan(investments.index[-7], investments.index[-1], alpha=0.5, color='lightgrey')
plt.plot(observations, color='r', label='actual')
plt.plot(predictions, label='prediction')
plt.legend()
plt.show()
RMSE Train: 8.627
RMSE Test: 5.803

Baseline for Goverment Expenditures

In [12]:
X = government_expenditure.values
train, test = X[0:-6], X[-6:]

# walk-forward validation for test predictions
history = [x for x in train]
predictions_test = list()
for i in range(len(test)):
	# make prediction
	predictions_test.append(history[-4])
	# observation
	history.append(test[i])

#Converting to dataframe to be able to concatenate
predictions_test = pd. DataFrame(predictions_test) 

# walk-forward validation for train predictions
predictions_train = list()
for i in range(len(train)):
	# make prediction
	predictions_train.append(history[-4])
	# observation
	history.append(train[i])
 
#Converting to dataframe and replacing first 4 observations with NAs
predictions_train = pd.DataFrame(predictions_train)
predictions_train[0][:4] = np.NaN 

#Concatenating predictions
predictions_total = pd.concat([predictions_train, predictions_test], ignore_index=True)

# report performance
forecast_values_gov_baseline = predictions_test

rmse_test = sqrt(mean_squared_error(test, predictions_test))
rmse_train = sqrt(mean_squared_error(train[4:], predictions_train[4:]))

print('RMSE Train: %.3f' % rmse_train)
print('RMSE Test: %.3f' % rmse_test)

predictions = predictions_total[4:]
observations = pd.DataFrame(X[4:])
predictions['Time']=pd.period_range('1991-01', periods=120, freq='Q')
predictions=predictions.set_index('Time')
predictions.index = predictions.index.to_timestamp()
observations['Time']=pd.period_range('1991-01', periods=120, freq='Q')
observations=observations.set_index('Time')
observations.index = observations.index.to_timestamp()

# line plot of observed vs predicted
plt.figure(figsize=(15, 7.5))
plt.axvspan(government_expenditure.index[-7], government_expenditure.index[-1], alpha=0.5, color='lightgrey')
plt.plot(observations, color='r', label='actual')
plt.plot(predictions, label='prediction')
plt.legend()
plt.show()
RMSE Train: 3.600
RMSE Test: 4.934

Baseline for GDP

In [13]:
X = gdp.values
train, test = X[0:-6], X[-6:]

# walk-forward validation for test predictions
history = [x for x in train]
predictions_test = list()
for i in range(len(test)):
	# make prediction
	predictions_test.append(history[-4])
	# observation
	history.append(test[i])

#Converting to dataframe to be able to concatenate
predictions_test = pd. DataFrame(predictions_test) 

# walk-forward validation for train predictions
predictions_train = list()
for i in range(len(train)):
	# make prediction
	predictions_train.append(history[-4])
	# observation
	history.append(train[i])
 
#Converting to dataframe and replacing first 4 observations with NAs
predictions_train = pd.DataFrame(predictions_train)
predictions_train[0][:4] = np.NaN 

#Concatenating predictions
predictions_total = pd.concat([predictions_train, predictions_test], ignore_index=True)

# report performance
forecast_values_gdp_baseline = predictions_test

rmse_test = sqrt(mean_squared_error(test, predictions_test))
rmse_train = sqrt(mean_squared_error(train[4:], predictions_train[4:]))

print('RMSE Train: %.3f' % rmse_train)
print('RMSE Test: %.3f' % rmse_test)

predictions = predictions_total[4:]
observations = pd.DataFrame(X[4:])
predictions['Time']=pd.period_range('1991-01', periods=120, freq='Q')
predictions=predictions.set_index('Time')
predictions.index = predictions.index.to_timestamp()
observations['Time']=pd.period_range('1991-01', periods=120, freq='Q')
observations=observations.set_index('Time')
observations.index = observations.index.to_timestamp()

# line plot of observed vs predicted
plt.figure(figsize=(15, 7.5))
plt.axvspan(government_expenditure.index[-7], government_expenditure.index[-1], alpha=0.5, color='lightgrey')
plt.plot(observations, color='r', label='actual')
plt.plot(predictions, label='prediction')
plt.legend()
plt.show()
RMSE Train: 15.711
RMSE Test: 18.707

Econometrics

In [14]:
!pip install pmdarima
Requirement already satisfied: pmdarima in /usr/local/lib/python3.7/dist-packages (1.8.2)
Requirement already satisfied: Cython!=0.29.18,>=0.29 in /usr/local/lib/python3.7/dist-packages (from pmdarima) (0.29.23)
Requirement already satisfied: scipy>=1.3.2 in /usr/local/lib/python3.7/dist-packages (from pmdarima) (1.4.1)
Requirement already satisfied: scikit-learn>=0.22 in /usr/local/lib/python3.7/dist-packages (from pmdarima) (0.22.2.post1)
Requirement already satisfied: pandas>=0.19 in /usr/local/lib/python3.7/dist-packages (from pmdarima) (1.1.5)
Requirement already satisfied: numpy~=1.19.0 in /usr/local/lib/python3.7/dist-packages (from pmdarima) (1.19.5)
Requirement already satisfied: urllib3 in /usr/local/lib/python3.7/dist-packages (from pmdarima) (1.24.3)
Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.7/dist-packages (from pmdarima) (1.0.1)
Requirement already satisfied: statsmodels!=0.12.0,>=0.11 in /usr/local/lib/python3.7/dist-packages (from pmdarima) (0.12.2)
Requirement already satisfied: setuptools!=50.0.0,>=38.6.0 in /usr/local/lib/python3.7/dist-packages (from pmdarima) (56.1.0)
Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas>=0.19->pmdarima) (2.8.1)
Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas>=0.19->pmdarima) (2018.9)
Requirement already satisfied: patsy>=0.5 in /usr/local/lib/python3.7/dist-packages (from statsmodels!=0.12.0,>=0.11->pmdarima) (0.5.1)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas>=0.19->pmdarima) (1.15.0)

Decomposition

In [15]:
from statsmodels.tsa.seasonal import seasonal_decompose
from dateutil.parser import parse
import pandas.util.testing as tm

Decomposition for Consumption

In [16]:
decomposition = seasonal_decompose(consumption_train.consumption, model='additive')
plt.rcParams.update({'figure.figsize': (10,10)})
decomposition.plot();

Decomposition for Imports

In [17]:
decomposition = seasonal_decompose(imports_train['imports'], model='additive')
plt.rcParams.update({'figure.figsize': (10,10)})
decomposition.plot();

Decomposition for Exports

In [18]:
decomposition = seasonal_decompose(exports_train['exports'], model='additive')
plt.rcParams.update({'figure.figsize': (10,10)})
decomposition.plot();

Decomposition for Investments

In [19]:
decomposition = seasonal_decompose(investments_train['investments'], model='additive')
plt.rcParams.update({'figure.figsize': (10,10)})
decomposition.plot();

Decomposition for Government Expenditures

In [20]:
decomposition = seasonal_decompose(government_expenditure_train['government_expenditure'], model='additive')
plt.rcParams.update({'figure.figsize': (10,10)})
decomposition.plot();

ACF plots of the original series

ACF for Consumption

In [21]:
import statsmodels.api as sm
plt.rcParams["figure.figsize"] = (8,6)
sm.graphics.tsa.plot_acf(consumption_train.consumption, lags=40, alpha = 0.05, title='ACF of Consumption');

ACF for Imports, Exports, Investments and Government Expenditures

In [22]:
fig, ax=plt.subplots(2,2, figsize=(18,10))
fig.suptitle('Autocorrelation of original series')
sm.graphics.tsa.plot_acf(imports_train.imports, lags=40, alpha = 0.05, ax=ax[0,0], title='ACF of Imports')
sm.graphics.tsa.plot_acf(exports_train.exports, lags=40, alpha = 0.05, ax=ax[0,1], title='ACF of Exports')
sm.graphics.tsa.plot_acf(investments_train.investments, lags=40, alpha = 0.05, ax=ax[1,0], title='ACF of Investments')
sm.graphics.tsa.plot_acf(government_expenditure_train.government_expenditure, lags=40, alpha = 0.05, ax=ax[1,1], title='ACF of Government Expenditure');

Stationarity test of the original series

ADF test

In [23]:
from statsmodels.tsa.stattools import adfuller

def adfuller_test(series, signif=0.05, name='', verbose=False):

    r = adfuller(series, autolag='AIC', regression = 'ct') 
    output = {'test_statistic':round(r[0], 4), 'pvalue':round(r[1], 4), 'n_lags':round(r[2], 4), 'n_obs':r[3]}
    p_value = output['pvalue'] 
    def adjust(val, length= 6): return str(val).ljust(length)

    # Print Summary
    print(f' p-value               = {p_value}')
    if p_value <= signif:
        print(f" Rejecting Null Hypothesis => Series is Stationary.")
    else:
        print(f" Weak evidence to reject the Null Hypothesis => Series is non-stationary.")
In [24]:
print(f'ADF for original series')
print(f'Consumption:')
print(adfuller_test(consumption_train.consumption))
print(f'Imports:')
print(adfuller_test(imports_train.imports))
print(f'Exports:')
print(adfuller_test(exports_train.exports))
print(f'Investments:')
print(adfuller_test(investments_train.investments))
print(f'Government Expenditures:')
print(adfuller_test(government_expenditure_train.government_expenditure))
ADF for original series
Consumption:
 p-value               = 0.1126
 Weak evidence to reject the Null Hypothesis => Series is non-stationary.
None
Imports:
 p-value               = 0.0813
 Weak evidence to reject the Null Hypothesis => Series is non-stationary.
None
Exports:
 p-value               = 0.1283
 Weak evidence to reject the Null Hypothesis => Series is non-stationary.
None
Investments:
 p-value               = 0.3858
 Weak evidence to reject the Null Hypothesis => Series is non-stationary.
None
Government Expenditures:
 p-value               = 0.7858
 Weak evidence to reject the Null Hypothesis => Series is non-stationary.
None

KPSS test

In [25]:
from statsmodels.tsa.stattools import kpss
def kpss_test(series, **kw):    
    statistic, p_value, n_lags, critical_values = kpss(series, nlags='auto', **kw)
    def adjust(val, length= 6): return str(val).ljust(length)

    print(f'p-value               = {p_value}')
    print(f'The series is {"not " if p_value < 0.05 else ""}stationary')
In [26]:
print(f'KPSS for original series')
print(f'Consumption:')
print(kpss_test(consumption_train.consumption))
print(f'Imports:')
print(kpss_test(imports_train.imports))
print(f'Exports:')
print(kpss_test(exports_train.exports))
print(f'Investments:')
print(kpss_test(investments_train.investments))
print(f'Government Expenditures:')
print(kpss_test(government_expenditure_train.government_expenditure))
KPSS for original series
Consumption:
p-value               = 0.01
The series is not stationary
None
Imports:
p-value               = 0.01
The series is not stationary
None
Exports:
p-value               = 0.01
The series is not stationary
None
Investments:
p-value               = 0.01
The series is not stationary
None
Government Expenditures:
p-value               = 0.01
The series is not stationary
None
/usr/local/lib/python3.7/dist-packages/statsmodels/tsa/stattools.py:1907: InterpolationWarning: The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is smaller than the p-value returned.

  warn_msg.format(direction="smaller"), InterpolationWarning
/usr/local/lib/python3.7/dist-packages/statsmodels/tsa/stattools.py:1907: InterpolationWarning: The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is smaller than the p-value returned.

  warn_msg.format(direction="smaller"), InterpolationWarning
/usr/local/lib/python3.7/dist-packages/statsmodels/tsa/stattools.py:1907: InterpolationWarning: The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is smaller than the p-value returned.

  warn_msg.format(direction="smaller"), InterpolationWarning
/usr/local/lib/python3.7/dist-packages/statsmodels/tsa/stattools.py:1907: InterpolationWarning: The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is smaller than the p-value returned.

  warn_msg.format(direction="smaller"), InterpolationWarning
/usr/local/lib/python3.7/dist-packages/statsmodels/tsa/stattools.py:1907: InterpolationWarning: The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is smaller than the p-value returned.

  warn_msg.format(direction="smaller"), InterpolationWarning

Removing trend

In [27]:
#Taking the first difference to remove overall trend
gdp_train['gdp_diff'] = gdp_train.gdp.diff().dropna(inplace=False)
consumption_train['consumption_diff'] = gdp_train.gdp.diff().dropna(inplace=False)
imports_train['imports_diff'] = imports_train.imports.diff().dropna(inplace=False)
exports_train['exports_diff'] = exports_train.exports.diff().dropna(inplace=False)
investments_train['investments_diff'] = investments_train.investments.diff().dropna(inplace=False)
government_expenditure_train['government_expenditure_diff'] = government_expenditure_train.government_expenditure.diff().dropna(inplace=False)

Plots of first differenced series

Plots of Consumption

In [28]:
plt.rcParams["figure.figsize"] = (10,6)
plt.plot(consumption_train.consumption_diff)
plt.title('First differenced Consumption')
Out[28]:
Text(0.5, 1.0, 'First differenced Consumption')
In [29]:
sm.graphics.tsa.plot_acf(consumption_train.consumption_diff.dropna(), lags=40, alpha = 0.05, title='ACF of Consumption');

Plots of Imports, Exports, Investments and Government Expenditures

In [30]:
fig, axs = plt.subplots(2, 2, figsize=(18,10), sharex=False, sharey=False)
fig.suptitle('First differenced series')
axs[0, 0].plot(imports_train.imports_diff)
axs[0, 0].set_title('Imports')
axs[0, 1].plot(exports_train.exports_diff)
axs[0, 1].set_title('Exports')
axs[1, 0].plot(investments_train.investments_diff)
axs[1, 0].set_title('Investments')
axs[1, 1].plot(government_expenditure_train.government_expenditure_diff)
axs[1, 1].set_title('Government expenditure')
Out[30]:
Text(0.5, 1.0, 'Government expenditure')
In [31]:
fig, ax=plt.subplots(2,2, figsize=(18,10))
fig.suptitle('Autocorrelation of first differenced series')
sm.graphics.tsa.plot_acf(imports_train.imports_diff.dropna(), lags=40, alpha = 0.05, ax=ax[0,0], title='ACF of Imports')
sm.graphics.tsa.plot_acf(exports_train.exports_diff.dropna(), lags=40, alpha = 0.05, ax=ax[0,1], title='ACF of Exports')
sm.graphics.tsa.plot_acf(investments_train.investments_diff.dropna(), lags=40, alpha = 0.05, ax=ax[1,0], title='ACF of Investments')
sm.graphics.tsa.plot_acf(government_expenditure_train.government_expenditure_diff.dropna(), lags=40, alpha = 0.05, ax=ax[1,1], title='ACF of Government Expenditure');

Stationarity test of the first differenced series

ADF test

In [32]:
def adfuller_test(series, signif=0.05, name='', verbose=False):

    r = adfuller(series, autolag='AIC', regression = 'c') # Default: Regression = 'c' (constant only)
    output = {'test_statistic':round(r[0], 4), 'pvalue':round(r[1], 4), 'n_lags':round(r[2], 4), 'n_obs':r[3]}
    p_value = output['pvalue'] 
    def adjust(val, length= 6): return str(val).ljust(length)

    print(f' p-value               = {p_value}')
    if p_value <= signif:
        print(f" Rejecting Null Hypothesis => Series is Stationary.")
    else:
        print(f" Weak evidence to reject the Null Hypothesis => Series is non-stationary.")
In [33]:
print(f'ADF for first differenced series')
print(f'Consumption:')
print(adfuller_test(consumption_train.consumption_diff.dropna()))
print(f'Imports:')
print(adfuller_test(imports_train.imports_diff.dropna()))
print(f'Exports:')
print(adfuller_test(exports_train.exports_diff.dropna()))
print(f'Investments:')
print(adfuller_test(investments_train.investments_diff.dropna()))
print(f'government expenditure:')
print(adfuller_test(government_expenditure_train.government_expenditure_diff.dropna()))
ADF for first differenced series
Consumption:
 p-value               = 0.0035
 Rejecting Null Hypothesis => Series is Stationary.
None
Imports:
 p-value               = 0.0004
 Rejecting Null Hypothesis => Series is Stationary.
None
Exports:
 p-value               = 0.0028
 Rejecting Null Hypothesis => Series is Stationary.
None
Investments:
 p-value               = 0.002
 Rejecting Null Hypothesis => Series is Stationary.
None
government expenditure:
 p-value               = 0.0415
 Rejecting Null Hypothesis => Series is Stationary.
None

KPSS test

In [34]:
print(f'KPSS for first differenced series')
print(f'Consumption:')
print(kpss_test(consumption_train.consumption_diff.dropna()))
print(f'Imports:')
print(kpss_test(imports_train.imports_diff.dropna()))
print(f'Exports:')
print(kpss_test(exports_train.exports_diff.dropna()))
print(f'Investments:')
print(kpss_test(investments_train.investments_diff.dropna()))
print(f'government expenditure:')
print(kpss_test(government_expenditure_train.government_expenditure_diff.dropna()))
KPSS for first differenced series
Consumption:
p-value               = 0.1
The series is stationary
None
Imports:
p-value               = 0.1
The series is stationary
None
Exports:
p-value               = 0.1
The series is stationary
None
Investments:
p-value               = 0.1
The series is stationary
None
government expenditure:
p-value               = 0.1
The series is stationary
None
/usr/local/lib/python3.7/dist-packages/statsmodels/tsa/stattools.py:1911: InterpolationWarning: The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is greater than the p-value returned.

  warn_msg.format(direction="greater"), InterpolationWarning
/usr/local/lib/python3.7/dist-packages/statsmodels/tsa/stattools.py:1911: InterpolationWarning: The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is greater than the p-value returned.

  warn_msg.format(direction="greater"), InterpolationWarning
/usr/local/lib/python3.7/dist-packages/statsmodels/tsa/stattools.py:1911: InterpolationWarning: The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is greater than the p-value returned.

  warn_msg.format(direction="greater"), InterpolationWarning
/usr/local/lib/python3.7/dist-packages/statsmodels/tsa/stattools.py:1911: InterpolationWarning: The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is greater than the p-value returned.

  warn_msg.format(direction="greater"), InterpolationWarning
/usr/local/lib/python3.7/dist-packages/statsmodels/tsa/stattools.py:1911: InterpolationWarning: The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is greater than the p-value returned.

  warn_msg.format(direction="greater"), InterpolationWarning

Removing seasonality

In [35]:
consumption_train['consumption_season'] = consumption_train.consumption_diff.diff(4).dropna()
imports_train['imports_season'] = imports_train.imports_diff.diff(4).dropna()
exports_train['exports_season'] = exports_train.exports_diff.diff(4).dropna()
investments_train['investments_season'] = investments_train.investments_diff.diff(4).dropna()
government_expenditure_train['government_expenditure_season'] = government_expenditure_train.government_expenditure_diff.diff(4).dropna()

Plots of first and seasonality differenced series

Plots of Consumption

In [36]:
plt.rcParams["figure.figsize"] = (10,6)
plt.plot(consumption_train.consumption_season)
Out[36]:
[<matplotlib.lines.Line2D at 0x7f79bf74ead0>]
In [37]:
sm.graphics.tsa.plot_acf(consumption_train.consumption_season.dropna(), lags=40, alpha = 0.05, title='ACF of consumption');

Plots of Imports, Exports, Investments and Government Expenditures

In [38]:
fig, axs = plt.subplots(2, 2, figsize=(18,10), sharex=False, sharey=False)
fig.suptitle('First and seasonality differenced series')
axs[0, 0].plot(imports_train.imports_season)
axs[0, 0].set_title('Imports')
axs[0, 1].plot(exports_train.exports_season)
axs[0, 1].set_title('Exports')
axs[1, 0].plot(investments_train.investments_season)
axs[1, 0].set_title('Investments')
axs[1, 1].plot(government_expenditure_train.government_expenditure_season)
axs[1, 1].set_title('government expenditure')
Out[38]:
Text(0.5, 1.0, 'government expenditure')
In [39]:
fig, ax=plt.subplots(2,2, figsize=(18,10))
fig.suptitle('Autocorrelation of first and seasonality differenced series')
sm.graphics.tsa.plot_acf(imports_train.imports_season.dropna(), lags=40, alpha = 0.05, ax=ax[0,0], title='ACF of Imports')
sm.graphics.tsa.plot_acf(exports_train.exports_season.dropna(), lags=40, alpha = 0.05, ax=ax[0,1], title='ACF of Exports')
sm.graphics.tsa.plot_acf(investments_train.investments_season.dropna(), lags=40, alpha = 0.05, ax=ax[1,0], title='ACF of Investments')
sm.graphics.tsa.plot_acf(government_expenditure_train.government_expenditure_season.dropna(), lags=40, alpha = 0.05, ax=ax[1,1], title='ACF of Government Expenditure');

Stationarity test of the first and seasonality differenced series

ADF test

In [40]:
def adfuller_test(series, signif=0.05, name='', verbose=False):

    r = adfuller(series, autolag='AIC', regression = 'nc')
    output = {'test_statistic':round(r[0], 4), 'pvalue':round(r[1], 4), 'n_lags':round(r[2], 4), 'n_obs':r[3]}
    p_value = output['pvalue'] 
    def adjust(val, length= 6): return str(val).ljust(length)

    print(f' p-value               = {p_value}')
    if p_value <= signif:
        print(f" Rejecting Null Hypothesis => Series is Stationary.")
    else:
        print(f" Weak evidence to reject the Null Hypothesis => Series is non-stationary.")
In [41]:
print(f'ADF for first and seasonality differenced series')
print(f'Consumption:')
print(adfuller_test(consumption_train.consumption_season.dropna()))
print(f'Imports:')
print(adfuller_test(imports_train.imports_season.dropna()))
print(f'Exports:')
print(adfuller_test(exports_train.exports_season.dropna()))
print(f'Investments:')
print(adfuller_test(investments_train.investments_season.dropna()))
print(f'government expenditure:')
print(adfuller_test(government_expenditure_train.government_expenditure_season.dropna()))
ADF for first and seasonality differenced series
Consumption:
 p-value               = 0.0
 Rejecting Null Hypothesis => Series is Stationary.
None
Imports:
 p-value               = 0.0
 Rejecting Null Hypothesis => Series is Stationary.
None
Exports:
 p-value               = 0.0
 Rejecting Null Hypothesis => Series is Stationary.
None
Investments:
 p-value               = 0.0
 Rejecting Null Hypothesis => Series is Stationary.
None
government expenditure:
 p-value               = 0.0003
 Rejecting Null Hypothesis => Series is Stationary.
None

KPSS test

In [42]:
print(f'KPSS for first differenced series')
print(f'Consumption:')
print(kpss_test(consumption_train.consumption_season.dropna()))
print(f'Imports:')
print(kpss_test(imports_train.imports_season.dropna()))
print(f'Exports:')
print(kpss_test(exports_train.exports_season.dropna()))
print(f'Investments:')
print(kpss_test(investments_train.investments_season.dropna()))
print(f'government expenditure:')
print(kpss_test(government_expenditure_train.government_expenditure_season.dropna()))
KPSS for first differenced series
Consumption:
p-value               = 0.1
The series is stationary
None
Imports:
p-value               = 0.1
The series is stationary
None
Exports:
p-value               = 0.1
The series is stationary
None
Investments:
p-value               = 0.1
The series is stationary
None
government expenditure:
p-value               = 0.1
The series is stationary
None
/usr/local/lib/python3.7/dist-packages/statsmodels/tsa/stattools.py:1911: InterpolationWarning: The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is greater than the p-value returned.

  warn_msg.format(direction="greater"), InterpolationWarning
/usr/local/lib/python3.7/dist-packages/statsmodels/tsa/stattools.py:1911: InterpolationWarning: The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is greater than the p-value returned.

  warn_msg.format(direction="greater"), InterpolationWarning
/usr/local/lib/python3.7/dist-packages/statsmodels/tsa/stattools.py:1911: InterpolationWarning: The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is greater than the p-value returned.

  warn_msg.format(direction="greater"), InterpolationWarning
/usr/local/lib/python3.7/dist-packages/statsmodels/tsa/stattools.py:1911: InterpolationWarning: The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is greater than the p-value returned.

  warn_msg.format(direction="greater"), InterpolationWarning
/usr/local/lib/python3.7/dist-packages/statsmodels/tsa/stattools.py:1911: InterpolationWarning: The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is greater than the p-value returned.

  warn_msg.format(direction="greater"), InterpolationWarning

ACF and PACF of the first and seasonality differenced series

Plots for Consumption

In [43]:
fig, ax = plt.subplots(1,2,figsize=(16,4))
sm.graphics.tsa.plot_acf(consumption_train.consumption_season.dropna(), lags=40, alpha = 0.05, ax=ax[0], title= 'ACF of Consumption')
sm.graphics.tsa.plot_pacf(consumption_train.consumption_season.dropna(), lags=40, alpha=0.05, ax=ax[1], title='PACF of Consumption')
plt.show()

Plots for Imports, Exports, Investments and Government Expenditures

In [44]:
fig, ax = plt.subplots(4,2,figsize=(19,14))
sm.graphics.tsa.plot_acf(imports_train.imports_season.dropna(), lags=40, alpha = 0.05, ax=ax[0,0], title='ACF of Imports')
sm.graphics.tsa.plot_pacf(imports_train.imports_season.dropna(), lags=40, alpha=0.05, ax=ax[0,1], title='PACF of Imports')
sm.graphics.tsa.plot_acf(exports_train.exports_season.dropna(), lags=40, alpha = 0.05, ax=ax[1,0], title='ACF of Exports')
sm.graphics.tsa.plot_pacf(exports_train.exports_season.dropna(), lags=40, alpha=0.05, ax=ax[1,1], title='PACF of Exports')
sm.graphics.tsa.plot_acf(investments_train.investments_season.dropna(), lags=40, alpha = 0.05, ax=ax[2,0], title='ACF of Investments')
sm.graphics.tsa.plot_pacf(investments_train.investments_season.dropna(), lags=40, alpha=0.05, ax=ax[2,1], title='PACF of Investments')
sm.graphics.tsa.plot_acf(government_expenditure_train.government_expenditure_season.dropna(), lags=40, alpha = 0.05, ax=ax[3,0], title='ACF of Government Expenditure')
sm.graphics.tsa.plot_pacf(government_expenditure_train.government_expenditure_season.dropna(), lags=40, alpha=0.05, ax=ax[3,1], title='PACF of Government Expenditure')
plt.show()

Construction of SARIMA models

In [45]:
from pmdarima import auto_arima
from pmdarima import ARIMA
In [46]:
imports_best = auto_arima(imports_train.imports.dropna(), start_p=0, d=1, start_q=0, max_p=3, max_q=3,      start_P=0, D=1, start_Q=0, max_P=3, max_Q=3,m=4, seasonal=True, with_intercept=False, information_criterion='aic')
exports_best = auto_arima(exports_train.exports.dropna(), start_p=0, d=1, start_q=0, max_p=3, max_q=3,      start_P=0, D=1, start_Q=0, max_P=3, max_Q=3,m=4, seasonal=True, with_intercept=False, information_criterion='aic')
investments_best = auto_arima(investments_train.investments.dropna(), start_p=0, d=1, start_q=0, max_p=3, max_q=3,      start_P=0, D=1, start_Q=0, max_P=3, max_Q=3,m=4, seasonal=True, with_intercept=False, information_criterion='aic')
government_expenditure_best = auto_arima(government_expenditure_train.government_expenditure.dropna(), start_p=0, d=1, start_q=0, max_p=3, max_q=3,      start_P=0, D=1, start_Q=0, max_P=3, max_Q=3,m=4, seasonal=True, with_intercept=False, information_criterion='aic')
consumption_best = auto_arima(consumption_train.consumption.dropna(), start_p=0, d=1, start_q=0, max_p=3, max_q=3,      start_P=0, D=1, start_Q=0, max_P=3, max_Q=3,m=4, seasonal=True, with_intercept=False, information_criterion='aic')
In [47]:
print(f'Consumption'), print(consumption_best)
print(f'Imports'), print(imports_best)
print(f'Exports'),print(exports_best)
print(f'Investments'), print(investments_best)
print(f'government_expenditure'), print(government_expenditure_best)
Consumption
 ARIMA(0,1,1)(0,1,1)[4]          
Imports
 ARIMA(1,1,0)(0,1,1)[4]          
Exports
 ARIMA(1,1,0)(0,1,1)[4]          
Investments
 ARIMA(0,1,2)(0,1,1)[4]          
government_expenditure
 ARIMA(0,1,1)(0,1,1)[4]          
Out[47]:
(None, None)
In [48]:
print(f'Consumption'), print(consumption_best.summary())
print(f'Imports'), print(imports_best.summary())
print(f'Exports'), print(exports_best.summary())
print(f'Investments'), print(investments_best.summary())
print(f'government_expenditure'), print(government_expenditure_best.summary())
Consumption
                                     SARIMAX Results                                     
=========================================================================================
Dep. Variable:                                 y   No. Observations:                  118
Model:             SARIMAX(0, 1, 1)x(0, 1, 1, 4)   Log Likelihood                -268.745
Date:                           Fri, 28 May 2021   AIC                            543.490
Time:                                   14:58:53   BIC                            551.672
Sample:                                        0   HQIC                           546.810
                                           - 118                                         
Covariance Type:                             opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ma.L1         -0.2001      0.069     -2.913      0.004      -0.335      -0.065
ma.S.L4       -0.6467      0.074     -8.736      0.000      -0.792      -0.502
sigma2         6.6799      0.684      9.773      0.000       5.340       8.020
===================================================================================
Ljung-Box (L1) (Q):                   0.00   Jarque-Bera (JB):                37.23
Prob(Q):                              0.95   Prob(JB):                         0.00
Heteroskedasticity (H):               0.97   Skew:                            -0.63
Prob(H) (two-sided):                  0.93   Kurtosis:                         5.51
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
Imports
                                      SARIMAX Results                                      
===========================================================================================
Dep. Variable:                                   y   No. Observations:                  118
Model:             SARIMAX(1, 1, 0)x(0, 1, [1], 4)   Log Likelihood                -370.086
Date:                             Fri, 28 May 2021   AIC                            746.172
Time:                                     14:58:53   BIC                            754.354
Sample:                                          0   HQIC                           749.492
                                             - 118                                         
Covariance Type:                               opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          0.1798      0.079      2.277      0.023       0.025       0.335
ma.S.L4       -0.9080      0.057    -15.943      0.000      -1.020      -0.796
sigma2        38.4965      3.297     11.675      0.000      32.034      44.959
===================================================================================
Ljung-Box (L1) (Q):                   0.01   Jarque-Bera (JB):               324.63
Prob(Q):                              0.94   Prob(JB):                         0.00
Heteroskedasticity (H):               4.31   Skew:                            -1.50
Prob(H) (two-sided):                  0.00   Kurtosis:                        10.74
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
Exports
                                      SARIMAX Results                                      
===========================================================================================
Dep. Variable:                                   y   No. Observations:                  118
Model:             SARIMAX(1, 1, 0)x(0, 1, [1], 4)   Log Likelihood                -377.953
Date:                             Fri, 28 May 2021   AIC                            761.907
Time:                                     14:58:53   BIC                            770.089
Sample:                                          0   HQIC                           765.227
                                             - 118                                         
Covariance Type:                               opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          0.1997      0.084      2.374      0.018       0.035       0.365
ma.S.L4       -0.8574      0.056    -15.352      0.000      -0.967      -0.748
sigma2        44.8892      4.731      9.489      0.000      35.617      54.161
===================================================================================
Ljung-Box (L1) (Q):                   0.00   Jarque-Bera (JB):                52.90
Prob(Q):                              0.97   Prob(JB):                         0.00
Heteroskedasticity (H):               2.51   Skew:                            -0.54
Prob(H) (two-sided):                  0.01   Kurtosis:                         6.17
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
Investments
                                      SARIMAX Results                                      
===========================================================================================
Dep. Variable:                                   y   No. Observations:                  118
Model:             SARIMAX(0, 1, 2)x(0, 1, [1], 4)   Log Likelihood                -349.293
Date:                             Fri, 28 May 2021   AIC                            706.585
Time:                                     14:58:53   BIC                            717.495
Sample:                                          0   HQIC                           711.012
                                             - 118                                         
Covariance Type:                               opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ma.L1         -0.3646      0.063     -5.759      0.000      -0.489      -0.241
ma.L2          0.2097      0.136      1.546      0.122      -0.056       0.476
ma.S.L4       -0.7169      0.067    -10.702      0.000      -0.848      -0.586
sigma2        27.5832      2.797      9.860      0.000      22.101      33.066
===================================================================================
Ljung-Box (L1) (Q):                   0.03   Jarque-Bera (JB):                63.29
Prob(Q):                              0.86   Prob(JB):                         0.00
Heteroskedasticity (H):               1.97   Skew:                            -0.66
Prob(H) (two-sided):                  0.04   Kurtosis:                         6.42
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
government_expenditure
                                     SARIMAX Results                                     
=========================================================================================
Dep. Variable:                                 y   No. Observations:                  118
Model:             SARIMAX(0, 1, 1)x(0, 1, 1, 4)   Log Likelihood                -178.644
Date:                           Fri, 28 May 2021   AIC                            363.288
Time:                                   14:58:53   BIC                            371.470
Sample:                                        0   HQIC                           366.608
                                           - 118                                         
Covariance Type:                             opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ma.L1         -0.1716      0.070     -2.453      0.014      -0.309      -0.034
ma.S.L4       -0.4761      0.084     -5.700      0.000      -0.640      -0.312
sigma2         1.3697      0.128     10.685      0.000       1.118       1.621
===================================================================================
Ljung-Box (L1) (Q):                   0.00   Jarque-Bera (JB):                65.11
Prob(Q):                              0.97   Prob(JB):                         0.00
Heteroskedasticity (H):               5.65   Skew:                            -0.84
Prob(H) (two-sided):                  0.00   Kurtosis:                         6.32
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
Out[48]:
(None, None)

Diagnostics

Roots of the AR and MA polynomials

In [49]:
import statsmodels.api as sm
np.set_printoptions(suppress=True) #Avoiding scientific notation

Roots for Consumption

In [50]:
MA_consumption=consumption_best.maroots()
inverse_MA_consumption=1/MA_consumption

AR_consumption=consumption_best.arroots()
inverse_AR_consumption=1/AR_consumption

print(f'MA roots'), print(pd.DataFrame(inverse_MA_consumption))
print(f'AR roots'), print(pd.DataFrame(inverse_AR_consumption))
MA roots
                    0
0 -0.896749-0.000000j
1 -0.000000+0.896749j
2 -0.000000-0.896749j
3  0.896749+0.000000j
4  0.200057+0.000000j
AR roots
Empty DataFrame
Columns: [0]
Index: []
Out[50]:
(None, None)
In [51]:
%pylab inline

#plot unit circle
t = linspace(0,2*pi,101) 
plot(cos(t),sin(t))
plot([0,0],[-1.1,1.1],'k',linewidth=0.5)
plot([-1.1,1.1],[0,0],'k',linewidth=0.5)
plt.title('Inverse MA roots')
axes().set_aspect('equal')

x = [ele.real for ele in inverse_MA_consumption]
y = [ele.imag for ele in inverse_MA_consumption]
plt.scatter(x, y, color='red')
plt.ylabel('Imaginary')
plt.xlabel('Real')
Populating the interactive namespace from numpy and matplotlib
Out[51]:
Text(0.5, 0, 'Real')

Roots for Imports

In [52]:
MA_imports=imports_best.maroots()
inverse_MA_imports=1/MA_imports

AR_imports=imports_best.arroots()
inverse_AR_imports=1/AR_imports

print(f'MA roots'), print(pd.DataFrame(inverse_MA_imports))
print(f'AR roots'), print(pd.DataFrame(inverse_AR_imports))
MA roots
                    0
0 -0.976164-0.000000j
1 -0.000000+0.976164j
2 -0.000000-0.976164j
3  0.976164+0.000000j
AR roots
          0
0  0.179802
Out[52]:
(None, None)
In [53]:
#plot unit circle
t = linspace(0,2*pi,101)
plot(cos(t),sin(t))
plot([0,0],[-1.1,1.1],'k',linewidth=0.5)
plot([-1.1,1.1],[0,0],'k',linewidth=0.5)
plt.title('Inverse AR roots')
axes().set_aspect('equal')

x = [ele.real for ele in inverse_AR_imports]
y = [ele.imag for ele in inverse_AR_imports]
plt.scatter(x, y, color='red')
plt.ylabel('Imaginary')
plt.xlabel('Real')
Out[53]:
Text(0.5, 0, 'Real')

Roots for Exports

In [54]:
MA_exports=exports_best.maroots()
inverse_MA_exports=1/MA_exports

AR_exports=exports_best.arroots()
inverse_AR_exports=1/AR_exports

print(f'MA roots'), print(pd.DataFrame(inverse_MA_exports))
print(f'AR roots'), print(pd.DataFrame(inverse_AR_exports))
MA roots
                    0
0 -0.962276-0.000000j
1 -0.000000+0.962276j
2 -0.000000-0.962276j
3  0.962276+0.000000j
AR roots
          0
0  0.199749
Out[54]:
(None, None)
In [55]:
#plot unit circle
t = linspace(0,2*pi,101)
plot(cos(t),sin(t))
plot([0,0],[-1.1,1.1],'k',linewidth=0.5)
plot([-1.1,1.1],[0,0],'k',linewidth=0.5)
plt.title('Inverse MA roots')
axes().set_aspect('equal')

x = [ele.real for ele in inverse_MA_exports]
y = [ele.imag for ele in inverse_MA_exports]
plt.scatter(x, y, color='red')
plt.ylabel('Imaginary')
plt.xlabel('Real')
Out[55]:
Text(0.5, 0, 'Real')
In [56]:
#plot unit circle
t = linspace(0,2*pi,101)
plot(cos(t),sin(t))
plot([0,0],[-1.1,1.1],'k',linewidth=0.5)
plot([-1.1,1.1],[0,0],'k',linewidth=0.5)
plt.title('Inverse AR roots')
axes().set_aspect('equal')

x = [ele.real for ele in inverse_AR_exports]
y = [ele.imag for ele in inverse_AR_exports]
plt.scatter(x, y, color='red')
plt.ylabel('Imaginary')
plt.xlabel('Real')
Out[56]:
Text(0.5, 0, 'Real')

Roots for Investments

In [57]:
MA_investments=investments_best.maroots()
inverse_MA_investments=1/MA_investments

AR_investments=investments_best.arroots()
inverse_AR_investments=1/AR_investments

print(f'MA roots'), print(pd.DataFrame(inverse_MA_investments))
print(f'AR roots'), print(pd.DataFrame(inverse_AR_investments))
MA roots
                    0
0 -0.920154-0.000000j
1  0.920154+0.000000j
2  0.000000+0.920154j
3  0.000000-0.920154j
4  0.182310+0.420099j
5  0.182310-0.420099j
AR roots
Empty DataFrame
Columns: [0]
Index: []
Out[57]:
(None, None)
In [58]:
#plot unit circle
t = linspace(0,2*pi,101)
plot(cos(t),sin(t))
plot([0,0],[-1.1,1.1],'k',linewidth=0.5)
plot([-1.1,1.1],[0,0],'k',linewidth=0.5)
plt.title('Inverse MA roots')
axes().set_aspect('equal')

x = [ele.real for ele in inverse_MA_investments]
y = [ele.imag for ele in inverse_MA_investments]
plt.scatter(x, y, color='red')
plt.ylabel('Imaginary')
plt.xlabel('Real')
Out[58]:
Text(0.5, 0, 'Real')

Roots for Government Expenditure

In [59]:
MA_government_expenditure=government_expenditure_best.maroots()
inverse_MA_government_expenditure=1/MA_government_expenditure

AR_government_expenditure=government_expenditure_best.arroots()
inverse_AR_government_expenditure=1/AR_government_expenditure

print(f'MA roots'), print(pd.DataFrame(inverse_MA_government_expenditure))
print(f'AR roots'), print(pd.DataFrame(inverse_AR_government_expenditure))
MA roots
                    0
0 -0.830656-0.000000j
1 -0.000000+0.830656j
2 -0.000000-0.830656j
3  0.830656+0.000000j
4  0.171561+0.000000j
AR roots
Empty DataFrame
Columns: [0]
Index: []
Out[59]:
(None, None)
In [60]:
#plot unit circle
t = linspace(0,2*pi,101)
plot(cos(t),sin(t))
plot([0,0],[-1.1,1.1],'k',linewidth=0.5)
plot([-1.1,1.1],[0,0],'k',linewidth=0.5)
plt.title('Inverse MA roots')
axes().set_aspect('equal')

x = [ele.real for ele in inverse_MA_government_expenditure]
y = [ele.imag for ele in inverse_MA_government_expenditure]
plt.scatter(x, y, color='red')
plt.ylabel('Imaginary')
plt.xlabel('Real')
Out[60]:
Text(0.5, 0, 'Real')

Residuals

In [61]:
plt.rcParams["figure.figsize"] = (20,3)
%matplotlib inline
plt.style.use('fivethirtyeight')
#import seaborn as sns
sns.set(rc={'figure.figsize':(11, 4)})

Residuals for Consumption

In [62]:
consumption_best.plot_diagnostics(figsize=(15,12));

Residuals for Imports

In [63]:
imports_best.plot_diagnostics(figsize=(15,12));

Residuals for Exports

In [64]:
exports_best.plot_diagnostics(figsize=(15,12));

Residuals for Investments

In [65]:
investments_best.plot_diagnostics(figsize=(15,12));

Residuals for Government Expenditures

In [66]:
government_expenditure_best.plot_diagnostics(figsize=(15,12));
Ljung-Box test
In [67]:
from statsmodels.stats.diagnostic import acorr_ljungbox
LB_consumption = acorr_ljungbox(consumption_best.resid(), boxpierce=False, lags=8, return_df=True)
LB_imports = acorr_ljungbox(imports_best.resid(), boxpierce=False, lags = 8, return_df=True) # Nulhypotese = Ingen seriel korrelation. Lille p-værdi = der er korrelation!
LB_exports = acorr_ljungbox(exports_best.resid(), boxpierce=False, lags = 8, return_df=True)
LB_investments = acorr_ljungbox(investments_best.resid(), boxpierce=False, lags = 8, return_df=True)
LB_government_expenditure = acorr_ljungbox(government_expenditure_best.resid(), boxpierce=False, lags = 8, return_df=True)
print(f'Consumption', LB_consumption)
print(f'Imports', LB_imports)
print(f'Exports', LB_exports)
print(f'Investments', LB_investments)
print(f'Government Expenditures', LB_government_expenditure)
Consumption      lb_stat  lb_pvalue
1   0.057893   0.809857
2   0.057951   0.971440
3   0.103868   0.991369
4  19.397589   0.000656
5  19.433116   0.001596
6  19.433824   0.003490
7  19.441129   0.006912
8  19.525428   0.012289
Imports      lb_stat  lb_pvalue
1   0.056360   0.812345
2   0.057472   0.971673
3   0.619740   0.891899
4  10.621834   0.031160
5  10.757051   0.056416
6  10.809945   0.094431
7  10.931884   0.141618
8  11.097521   0.196235
Exports      lb_stat  lb_pvalue
1   0.074478   0.784925
2   0.080842   0.960385
3   0.110294   0.990574
4  12.843794   0.012065
5  12.868674   0.024641
6  13.132185   0.040984
7  13.165928   0.068167
8  13.424483   0.098055
Investments     lb_stat  lb_pvalue
1  0.076272   0.782415
2  0.726580   0.695385
3  0.883242   0.829470
4  3.514931   0.475612
5  4.094936   0.535830
6  4.109728   0.661830
7  4.411829   0.731307
8  4.510285   0.808403
Government Expenditures      lb_stat  lb_pvalue
1   0.004395   0.947141
2   0.005173   0.997417
3   0.100833   0.991737
4  16.209853   0.002750
5  16.225963   0.006228
6  16.228738   0.012577
7  16.279769   0.022680
8  16.304341   0.038226

Predictions and forecasts

for Consumption

In [68]:
consumption_train['arima_model'] = consumption_best.arima_res_.fittedvalues
consumption_train['arima_model'][:4+1] = np.NaN
consumption['consumption'][:4+1] = np.NaN
forecast_consumption = pd.DataFrame(consumption_best.predict(n_periods=6))
forecast_consumption['Time'] = forecast_consumption['Time']=pd.period_range('2019-07', periods=6, freq='Q')
forecast_consumption=forecast_consumption.set_index('Time')
forecast_consumption.index = forecast_consumption.index.to_timestamp()
forecast_consumption=forecast_consumption.squeeze() #Converting to a series

forecast_consumption = consumption_train['arima_model'].append(forecast_consumption)
plt.figure(figsize=(15, 7.5))
plt.plot(forecast_consumption, color='r', label='model')
plt.axvspan(consumption_train.index[-1], forecast_consumption.index[-1], alpha=0.5, color='lightgrey')
plt.plot(consumption['consumption'], label='actual')
plt.title('Consumption')
plt.legend()
plt.show()

for Imports

In [69]:
imports_train['arima_model'] = imports_best.arima_res_.fittedvalues
imports_train['arima_model'][:4+1] = np.NaN
imports['imports'][:4+1] = np.NaN
forecast_imports = pd.DataFrame(imports_best.predict(n_periods=6))
forecast_imports['Time'] = forecast_imports['Time']=pd.period_range('2019-07', periods=6, freq='Q')
forecast_imports=forecast_imports.set_index('Time')
forecast_imports.index = forecast_imports.index.to_timestamp()
forecast_imports=forecast_imports.squeeze() #Converting to a series

forecast_imports = imports_train['arima_model'].append(forecast_imports)
plt.figure(figsize=(15, 7.5))
plt.plot(forecast_imports, color='r', label='model')
plt.axvspan(imports_train.index[-1], forecast_imports.index[-1], alpha=0.5, color='lightgrey')
plt.plot(imports['imports'], label='actual')
plt.title('imports')
plt.legend()
plt.show()

for Exports

In [70]:
exports_train['arima_model'] = exports_best.arima_res_.fittedvalues
exports_train['arima_model'][:4+1] = np.NaN
exports['exports'][:4+1] = np.NaN
forecast_exports = pd.DataFrame(exports_best.predict(n_periods=6))
forecast_exports['Time'] = forecast_exports['Time']=pd.period_range('2019-07', periods=6, freq='Q')
forecast_exports=forecast_exports.set_index('Time')
forecast_exports.index = forecast_exports.index.to_timestamp()
forecast_exports=forecast_exports.squeeze() #Converting to a series

forecast_exports = exports_train['arima_model'].append(forecast_exports)
plt.figure(figsize=(15, 7.5))
plt.plot(forecast_exports, color='r', label='model')
plt.axvspan(exports_train.index[-1], forecast_exports.index[-1], alpha=0.5, color='lightgrey')
plt.plot(exports['exports'], label='actual')
plt.title('exports')
plt.legend()
plt.show()

for Investments

In [71]:
investments_train['arima_model'] = investments_best.arima_res_.fittedvalues
investments_train['arima_model'][:4+1] = np.NaN
investments['investments'][:4+1] = np.NaN
forecast_investments = pd.DataFrame(investments_best.predict(n_periods=6))
forecast_investments['Time'] = forecast_investments['Time']=pd.period_range('2019-07', periods=6, freq='Q')
forecast_investments=forecast_investments.set_index('Time')
forecast_investments.index = forecast_investments.index.to_timestamp()
forecast_investments=forecast_investments.squeeze() #Converting to a series

forecast_investments = investments_train['arima_model'].append(forecast_investments)
plt.figure(figsize=(15, 7.5))
plt.plot(forecast_investments, color='r', label='model')
plt.axvspan(investments_train.index[-1], forecast_investments.index[-1], alpha=0.5, color='lightgrey')
plt.plot(investments['investments'], label='actual')
plt.title('investments')
plt.legend()
plt.show()

for Government Expenditures

In [72]:
government_expenditure_train['arima_model'] = government_expenditure_best.arima_res_.fittedvalues
government_expenditure_train['arima_model'][:4+1] = np.NaN
government_expenditure['government_expenditure'][:4+1] = np.NaN
forecast_government_expenditure = pd.DataFrame(government_expenditure_best.predict(n_periods=6))
forecast_government_expenditure['Time'] = forecast_government_expenditure['Time']=pd.period_range('2019-07', periods=6, freq='Q')
forecast_government_expenditure=forecast_government_expenditure.set_index('Time')
forecast_government_expenditure.index = forecast_government_expenditure.index.to_timestamp()
forecast_government_expenditure=forecast_government_expenditure.squeeze() #Converting to a series

forecast_government_expenditure = government_expenditure_train['arima_model'].append(forecast_government_expenditure)
plt.figure(figsize=(15, 7.5))
plt.plot(forecast_government_expenditure, color='r', label='model')
plt.axvspan(government_expenditure_train.index[-1], forecast_government_expenditure.index[-1], alpha=0.5, color='lightgrey')
plt.plot(government_expenditure['government_expenditure'], label='actual')
plt.title('government_expenditure')
plt.legend()
plt.show()

RMSEs of predictions and forecasts

In [73]:
from sklearn.metrics import mean_squared_error
import math

RMSE for Consumption

In [74]:
true_train_consumption = consumption_train.consumption.iloc[1:]
predicted_train_consumption = pd.DataFrame(consumption_best.arima_res_.fittedvalues).iloc[1:]
rmse_train_consumption = math.sqrt(mean_squared_error(true_train_consumption, predicted_train_consumption))

forecast_values_consumption_ARIMA = forecast_consumption.iloc[118:]
true_values_consumption_ARIMA = consumption_test.consumption
rmse_test_consumption = math.sqrt(mean_squared_error(true_values_consumption_ARIMA, forecast_values_consumption_ARIMA))

print('Train Score: %.2f RMSE' % (rmse_train_consumption))
print('Test Score: %.2f RMSE' % (rmse_test_consumption))
Train Score: 5.70 RMSE
Test Score: 11.81 RMSE

RMSE for Imports

In [75]:
true_train_imports = imports_train.imports.iloc[1:]
predicted_train_imports = pd.DataFrame(imports_best.arima_res_.fittedvalues).iloc[1:]
rmse_train_imports = math.sqrt(mean_squared_error(true_train_imports, predicted_train_imports))

forecast_values_imports_ARIMA = forecast_imports.iloc[118:]
true_values_imports_ARIMA = imports_test.imports
rmse_test_imports = math.sqrt(mean_squared_error(true_values_imports_ARIMA, forecast_values_imports_ARIMA))

print('Train Score: %.2f RMSE' % (rmse_train_imports))
print('Test Score: %.2f RMSE' % (rmse_test_imports))
Train Score: 7.04 RMSE
Test Score: 23.06 RMSE

RMSE for Exports

In [76]:
true_train_exports = exports_train.exports.iloc[1:]
predicted_train_exports = pd.DataFrame(exports_best.arima_res_.fittedvalues).iloc[1:]
rmse_train_exports = math.sqrt(mean_squared_error(true_train_exports, predicted_train_exports))

forecast_values_exports_ARIMA = forecast_exports.iloc[118:]
true_values_exports_ARIMA = exports_test.exports
rmse_test_exports = math.sqrt(mean_squared_error(true_values_exports_ARIMA, forecast_values_exports_ARIMA))

print('Train Score: %.2f RMSE' % (rmse_train_exports))
print('Test Score: %.2f RMSE' % (rmse_test_exports))
Train Score: 7.81 RMSE
Test Score: 34.50 RMSE

RMSE for Investments

In [77]:
true_train_investments = investments_train.investments.iloc[1:]
predicted_train_investments = pd.DataFrame(investments_best.arima_res_.fittedvalues).iloc[1:]
rmse_train_investments = math.sqrt(mean_squared_error(true_train_investments, predicted_train_investments))

forecast_values_investments_ARIMA = forecast_investments.iloc[118:]
true_values_investments_ARIMA = investments_test.investments
rmse_test_investments = math.sqrt(mean_squared_error(true_values_investments_ARIMA, forecast_values_investments_ARIMA))

print('Train Score: %.2f RMSE' % (rmse_train_investments))
print('Test Score: %.2f RMSE' % (rmse_test_investments))
Train Score: 5.45 RMSE
Test Score: 6.71 RMSE

RMSE for Government Expenditures

In [78]:
true_train_government_expenditure = government_expenditure_train.government_expenditure.iloc[1:]
predicted_train_government_expenditure = pd.DataFrame(government_expenditure_best.arima_res_.fittedvalues).iloc[1:]
rmse_train_government_expenditure = math.sqrt(mean_squared_error(true_train_government_expenditure, predicted_train_government_expenditure))

forecast_values_government_expenditure_ARIMA = forecast_government_expenditure.iloc[118:]
true_values_government_expenditure_ARIMA = government_expenditure_test.government_expenditure
rmse_test_government_expenditure = math.sqrt(mean_squared_error(true_values_government_expenditure_ARIMA, forecast_values_government_expenditure_ARIMA))

print('Train Score: %.2f RMSE' % (rmse_train_government_expenditure))
print('Test Score: %.2f RMSE' % (rmse_test_government_expenditure))
Train Score: 2.51 RMSE
Test Score: 4.38 RMSE

Forecasting GDP

In [79]:
#Converting series objects to dataframes
forecast_consumption = forecast_consumption.to_frame('consumption')
forecast_imports = forecast_imports.to_frame('imports')
forecast_exports = forecast_exports.to_frame('exports')
forecast_investments = forecast_investments.to_frame('investments')
forecast_government_expenditure = forecast_government_expenditure.to_frame('government_expenditure')
In [80]:
#Merging all forecasts to one dataframe
data = pd.merge(forecast_consumption, forecast_imports, on='Time')
data = pd.merge(data, exports, on='Time')
data = pd.merge(data, investments, on='Time')
data = pd.merge(data, government_expenditure, on='Time')
In [81]:
#Adding the combined forecast and true gdp values to the data set
data['gdp_forecast'] = data.apply(lambda row: row.consumption - row.imports + row.exports + row.investments + row.government_expenditure, axis = 1)
data['gdp'] = gdp.gdp
data
Out[81]:
consumption imports exports investments government_expenditure gdp_forecast gdp
Time
1990-01-01 NaN NaN NaN NaN NaN NaN 210.2
1990-04-01 NaN NaN NaN NaN NaN NaN 218.1
1990-07-01 NaN NaN NaN NaN NaN NaN 209.5
1990-10-01 NaN NaN NaN NaN NaN NaN 217.8
1991-01-01 NaN NaN NaN NaN NaN NaN 220.2
... ... ... ... ... ... ... ...
2019-10-01 278.563498 298.272738 348.9 132.9 146.6 608.690760 603.2
2020-01-01 271.108741 294.485278 332.4 139.3 139.5 587.823463 576.2
2020-04-01 275.911778 301.253054 293.3 130.8 141.4 540.158724 558.6
2020-07-01 266.978667 301.521850 317.6 136.8 139.4 559.256817 572.7
2020-10-01 285.025220 307.064464 319.6 131.6 155.3 584.460756 596.9

124 rows × 7 columns

In [82]:
plt.figure(figsize=(15, 7.5))
plt.plot(data['gdp_forecast'], color='r', label='prediction')
plt.axvspan(government_expenditure_train.index[-1], forecast_government_expenditure.index[-1], alpha=0.5, color='lightgrey')
plt.plot(data['gdp'], label='actual')
plt.title('GDP')
plt.legend()
plt.show()
In [83]:
true_train_gdp = data.gdp.iloc[5:118]
predicted_train_gdp = data.gdp_forecast.iloc[5:118]
rmse_train_gdp = math.sqrt(mean_squared_error(true_train_gdp, predicted_train_gdp))

forecast_values_gdp_ARIMA = data.gdp_forecast.iloc[118:]
true_values_gdp_ARIMA = data.gdp.iloc[118:]
rmse_test_gdp = math.sqrt(mean_squared_error(true_values_gdp_ARIMA, forecast_values_gdp_ARIMA))

print('Train Score: %.2f RMSE' % (rmse_train_gdp))
print('Test Score: %.2f RMSE' % (rmse_test_gdp))
Train Score: 5.57 RMSE
Test Score: 12.06 RMSE

Artificial Neural Networks

In [84]:
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
In [85]:
consumption = pd.read_csv('https://raw.githubusercontent.com/Louise198713/Speciale/main/Privatforbrug%201990K1-2020K4.csv',
                 sep=';',
                 header=0,
                 index_col=0,
                 parse_dates=[0],
                 names=['Time','consumption'])
consumption['Time']=pd.period_range('1990-01', periods=124, freq='Q')
consumption['consumption'] = pd.to_numeric(consumption['consumption'])
consumption=consumption.set_index('Time')
consumption.index = consumption.index.to_timestamp()

exports = pd.read_csv('https://raw.githubusercontent.com/Louise198713/Speciale/main/Eksport%201990K1-2020K4.csv',
                 sep=';',
                 header=0,
                 index_col=0,
                 parse_dates=[0],
                 names=['Time','exports'])
exports['Time']=pd.period_range('1990-01', periods=124, freq='Q')
exports['exports'] = pd.to_numeric(exports['exports'])
exports=exports.set_index('Time')
exports.index = exports.index.to_timestamp()

imports = pd.read_csv('https://raw.githubusercontent.com/Louise198713/Speciale/main/Import%201990K1-2020K4.csv',
                 sep=';',
                 header=0,
                 index_col=0,
                 parse_dates=[0],
                 names=['Time','imports'])
imports['Time']=pd.period_range('1990-01', periods=124, freq='Q')
imports['imports'] = pd.to_numeric(imports['imports'])
imports=imports.set_index('Time')
imports.index = imports.index.to_timestamp()

investments = pd.read_csv('https://raw.githubusercontent.com/Louise198713/Speciale/main/Investment1990-2020.csv',
                 sep=';',
                 header=0,
                 index_col=0,
                 parse_dates=[0],
                 names=['Time','investments'])
investments['Time']=pd.period_range('1990-01', periods=124, freq='Q')
investments['investments'] = pd.to_numeric(investments['investments'])
investments=investments.set_index('Time')
investments.index = investments.index.to_timestamp()

government_expenditure = pd.read_csv('https://raw.githubusercontent.com/Louise198713/Speciale/main/Public%20consumption1990-2020.csv',
                 sep=';',
                 header=0,
                 index_col=0,
                 parse_dates=[0],
                 names=['Time','government_expenditure'])
government_expenditure['Time']=pd.period_range('1990-01', periods=124, freq='Q')
government_expenditure['government_expenditure'] = pd.to_numeric(government_expenditure['government_expenditure'])
government_expenditure=government_expenditure.set_index('Time')
government_expenditure.index = government_expenditure.index.to_timestamp()

gdp = pd.read_csv('https://raw.githubusercontent.com/Louise198713/Speciale/main/GDP1990-2020.csv',
                 sep=';',
                 header=0,
                 index_col=0,
                 parse_dates=[0],
                 names=['Time','gdp'])
gdp['Time']=pd.period_range('1990-01', periods=124, freq='Q')
gdp['gdp'] = pd.to_numeric(gdp['gdp'])
gdp=gdp.set_index('Time')
gdp.index = gdp.index.to_timestamp()

consumption = consumption.rename(columns= {'consumption':'y'})
imports = imports.rename(columns= {'imports':'y'})
exports = exports.rename(columns= {'exports':'y'})
investments = investments.rename(columns= {'investments':'y'})
government_expenditure = government_expenditure.rename(columns= {'government_expenditure':'y'})

A model for Consumption

In [86]:
data = consumption
In [87]:
#Creating a y variable, y+1
data['y+1_unscaled'] = data.y.shift(-1, fill_value=data.y.iloc[-1]) #shift 1 and fill the last (empty) row with previous value

# normalizing
scaler = MinMaxScaler(feature_range=(0, 1)) #Scaling between 0 and 1
data['y_scaled'] = scaler.fit_transform(data['y'].values.reshape(-1, 1))

#Creating a scaled y variable
data['y+1'] = data.y_scaled.shift(-1, fill_value=data.y_scaled.iloc[-1]) #shift 1 and fill the last (empty) row with previous value
In [88]:
# get the data as matrix
data_p = consumption.iloc[:,2:].values.astype('float32') #Only using the last to columns for modelling.

#Splitting into train and test
train, test = data_p[:118], data_p[118:]
print(len(train), len(test))

#Defining x and y
X_train = train[:,0]
y_train = train[:,1] #Consumption+1 for training set

X_test = test[:,0]
y_test = test[:,1] 

# reshape input to be [samples, time steps, features]
X_train = numpy.reshape(X_train, (X_train.shape[0], 1, 1))
X_test = numpy.reshape(X_test, (X_test.shape[0], 1, 1))
118 6
In [89]:
model = Sequential()
model.add(LSTM(units=64, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True)) #1 timestep and 1 feature
model.add(Dropout(0.1))
model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.1))
model.add(LSTM(64))
model.add(Dropout(0.1))
model.add(Dense(1))

model.compile( loss='mean_squared_error', optimizer='adam')

# fit network
history = model.fit(X_train, y_train, epochs=30, batch_size=16, validation_split=0.1, verbose=0, shuffle=False)
In [90]:
# make predictions
trainPredict = model.predict(X_train) #Predict y from x
testPredict = model.predict(X_test)
In [91]:
# invert predictions
trainPredict = scaler.inverse_transform(trainPredict)
y_train = scaler.inverse_transform([y_train])

testPredict = scaler.inverse_transform(testPredict)
y_test = scaler.inverse_transform([y_test])
In [92]:
#Creating a variable "y_pred" for all the predicted values for both train and test
data.insert(2, "y_pred", np.zeros((124,1)), False) #Inserting empty column with zeros to hold the predicted values
data['y_pred'].iloc[-6:] = testPredict.flatten() #Filling the last 6 rows with the predicted test values
data['y_pred'].iloc[:118] =trainPredict.flatten()
data
Out[92]:
y y+1_unscaled y_pred y_scaled y+1
Time
1990-01-01 101.7 106.9 121.513329 0.000000 0.028889
1990-04-01 106.9 105.4 124.689682 0.028889 0.020556
1990-07-01 105.4 111.5 123.764587 0.020556 0.054444
1990-10-01 111.5 107.8 127.571892 0.054444 0.033889
1991-01-01 107.8 111.1 125.248199 0.033889 0.052222
... ... ... ... ... ...
2019-10-01 281.7 261.8 292.402191 1.000000 0.889444
2020-01-01 261.8 253.6 266.602539 0.889444 0.843889
2020-04-01 253.6 254.2 256.499420 0.843889 0.847222
2020-07-01 254.2 276.2 257.228210 0.847222 0.969444
2020-10-01 276.2 276.2 285.090820 0.969444 0.969444

124 rows × 5 columns

In [93]:
#The predicted values are one period ahead. Adjusting the time.
data_new= data[['y_pred', 'y+1_unscaled']]
data_new['Time']=pd.period_range('1990-04', periods=124, freq='Q')
data_new=data_new.set_index('Time')
data_new.index =data_new.index.to_timestamp()
data_new = data_new.iloc[:-1]
data_new
Out[93]:
y_pred y+1_unscaled
Time
1990-04-01 121.513329 106.9
1990-07-01 124.689682 105.4
1990-10-01 123.764587 111.5
1991-01-01 127.571892 107.8
1991-04-01 125.248199 111.1
... ... ...
2019-10-01 263.861633 281.7
2020-01-01 292.402191 261.8
2020-04-01 266.602539 253.6
2020-07-01 256.499420 254.2
2020-10-01 257.228210 276.2

123 rows × 2 columns

In [94]:
plt.figure(figsize=(15, 7.5))
plt.plot(data_new.y_pred, color='r', label='model')
plt.axvspan(data_new.index[-7], data_new.index[-1], alpha=0.5, color='lightgrey')
plt.plot(data_new['y+1_unscaled'], label='actual')
plt.title('Consumption')
plt.ylabel('Bill. DKK')
plt.xlabel('Time')
plt.legend()
plt.show()
In [95]:
predicted_train_consumption_LSTM = data_new.y_pred[:117]
actual_train_consumption_LSTM = data_new['y+1_unscaled'][:117]

forecasted_test_consumption_LSTM = data_new.y_pred.iloc[-6:]
actual_test_consumption_LSTM = data_new['y+1_unscaled'].iloc[-6:]
In [96]:
#RMSEs
trainScore = math.sqrt(mean_squared_error(actual_train_consumption_LSTM, data_new.y_pred[:117]))
print('Train Score: %.2f RMSE' % (trainScore))

testScore = math.sqrt(mean_squared_error(actual_test_consumption_LSTM, forecasted_test_consumption_LSTM))
print('Test Score: %.2f RMSE' % (testScore))
Train Score: 10.92 RMSE
Test Score: 18.52 RMSE

A model for Imports

In [97]:
data = imports
In [98]:
#Creating a y variable, y+1
data['y+1_unscaled'] = data.y.shift(-1, fill_value=data.y.iloc[-1]) #shift 1 and fill the last (empty) row with previous value

#Normalizing
scaler = MinMaxScaler(feature_range=(0, 1)) #Scaling between 0 and 1
data['y_scaled'] = scaler.fit_transform(data.y.values.reshape(-1, 1))

#Creating a scaled y variable
data['y+1'] = data.y_scaled.shift(-1, fill_value=data.y_scaled.iloc[-1]) #shift 1 and fill the last (empty) row with previous value
In [99]:
# get the data as matrix
data_p = data.iloc[:,2:].values.astype('float32') #Only using the last to columns for modelling.

#Splitting into train and test
train, test = data_p[:118], data_p[118:]
print(len(train), len(test))

#Defining x and y
X_train = train[:,0]
y_train = train[:,1]

X_test = test[:,0]
y_test = test[:,1]

# reshape input to be [samples, time steps, features]
X_train = numpy.reshape(X_train, (X_train.shape[0], 1, 1))
X_test = numpy.reshape(X_test, (X_test.shape[0], 1, 1))
118 6
In [100]:
model = Sequential()
model.add(LSTM(units=64, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True)) #1 timestep and 1 feature
model.add(Dropout(0.1))
model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.1))
model.add(LSTM(64))
model.add(Dropout(0.1))
model.add(Dense(1))

model.compile( loss='mean_squared_error', optimizer='adam')

# fit network
history = model.fit( X_train, y_train, epochs=30, batch_size=16, validation_split=0.1, verbose=0, shuffle=False)
In [101]:
# make predictions
trainPredict = model.predict(X_train) #Predict y from x
testPredict = model.predict(X_test)
# invert predictions
trainPredict = scaler.inverse_transform(trainPredict)
y_train = scaler.inverse_transform([y_train])

testPredict = scaler.inverse_transform(testPredict)
y_test = scaler.inverse_transform([y_test])

#Creating a variable "Consumption_pred" for all the predicted values for both train and test
data.insert(2, "y_pred", np.zeros((124,1)), False) #Inserting empty column with zeros to hold the predicted values
data['y_pred'].iloc[-6:] = testPredict.flatten() #Filling the last 6 rows with the predicted test values
data['y_pred'].iloc[:118] =trainPredict.flatten()

#The predicted values are one period ahead. Adjusting the time.
data_new= data[['y_pred', 'y+1_unscaled']]
data_new['Time']=pd.period_range('1990-04', periods=124, freq='Q')
data_new=data_new.set_index('Time')
data_new.index =data_new.index.to_timestamp()
data_new = data_new.iloc[:-1]
data_new

plt.figure(figsize=(15, 7.5))
plt.plot(data_new.y_pred, color='r', label='model')
plt.axvspan(data_new.index[-7], data_new.index[-1], alpha=0.5, color='lightgrey')
plt.plot(data_new['y+1_unscaled'], label='actual')
plt.title('Imports')
plt.ylabel('Bill. DKK')
plt.xlabel('Time')
plt.legend()
plt.show()
In [102]:
predicted_train_imports_LSTM = data_new.y_pred[:117]
actual_train_imports_LSTM = data_new['y+1_unscaled'][:117]

forecasted_test_imports_LSTM = data_new.y_pred.iloc[-6:]
actual_test_imports_LSTM = data_new['y+1_unscaled'].iloc[-6:]
In [103]:
#RMSEs
trainScore = math.sqrt(mean_squared_error(actual_train_imports_LSTM, data_new.y_pred[:117]))
print('Train Score: %.2f RMSE' % (trainScore))

testScore = math.sqrt(mean_squared_error(actual_test_imports_LSTM, forecasted_test_imports_LSTM))
print('Test Score: %.2f RMSE' % (testScore))
Train Score: 10.36 RMSE
Test Score: 31.85 RMSE

A model for Exports

In [104]:
data = exports
In [105]:
#Creating a y variable, y+1
data['y+1_unscaled'] = data.y.shift(-1, fill_value=data.y.iloc[-1]) #shift 1 and fill the last (empty) row with previous value

#Normalizing
scaler = MinMaxScaler(feature_range=(0, 1)) #Scaling between 0 and 1
data['y_scaled'] = scaler.fit_transform(data.y.values.reshape(-1, 1))

#Creating a scaled y variable
data['y+1'] = data.y_scaled.shift(-1, fill_value=data.y_scaled.iloc[-1]) #shift 1 and fill the last (empty) row with previous value
In [106]:
# get the data as matrix
data_p = data.iloc[:,2:].values.astype('float32') #Only using the last to columns for modelling.

#Splitting into train and test
train, test = data_p[:118], data_p[118:]
print(len(train), len(test))

#Defining x and y
X_train = train[:,0]
y_train = train[:,1]

X_test = test[:,0]
y_test = test[:,1]

# reshape input to be [samples, time steps, features]
X_train = numpy.reshape(X_train, (X_train.shape[0], 1, 1))
X_test = numpy.reshape(X_test, (X_test.shape[0], 1, 1))
118 6
In [107]:
model = Sequential()
model.add(LSTM(units=64, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True)) #1 timestep and 1 feature
model.add(Dropout(0.1))
model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.1))
model.add(LSTM(64))
model.add(Dropout(0.1))
model.add(Dense(1))

model.compile( loss='mean_squared_error', optimizer='adam')

# fit network
history = model.fit( X_train, y_train, epochs=30, batch_size=16, validation_split=0.1, verbose=0, shuffle=False)
In [108]:
# make predictions
trainPredict = model.predict(X_train) #Predict y from x
testPredict = model.predict(X_test)
# invert predictions
trainPredict = scaler.inverse_transform(trainPredict)
y_train = scaler.inverse_transform([y_train])

testPredict = scaler.inverse_transform(testPredict)
y_test = scaler.inverse_transform([y_test])

#Creating a variable "Consumption_pred" for all the predicted values for both train and test
data.insert(2, "y_pred", np.zeros((124,1)), False) #Inserting empty column with zeros to hold the predicted values
data['y_pred'].iloc[-6:] = testPredict.flatten() #Filling the last 6 rows with the predicted test values
data['y_pred'].iloc[:118] =trainPredict.flatten()

#The predicted values are one period ahead. Adjusting the time.
data_new= data[['y_pred', 'y+1_unscaled']]
data_new['Time']=pd.period_range('1990-04', periods=124, freq='Q')
data_new=data_new.set_index('Time')
data_new.index =data_new.index.to_timestamp()
data_new = data_new.iloc[:-1]
data_new

plt.figure(figsize=(15, 7.5))
plt.plot(data_new.y_pred, color='r', label='model')
plt.axvspan(data_new.index[-7], data_new.index[-1], alpha=0.5, color='lightgrey')
plt.plot(data_new['y+1_unscaled'], label='actual')
plt.title('Exports')
plt.ylabel('Bill. DKK')
plt.xlabel('Time')
plt.legend()
plt.show()
In [109]:
predicted_train_exports_LSTM = data_new.y_pred[:117]
actual_train_exports_LSTM = data_new['y+1_unscaled'][:117]

forecasted_test_exports_LSTM = data_new.y_pred.iloc[-6:]
actual_test_exports_LSTM = data_new['y+1_unscaled'].iloc[-6:]

#RMSEs
trainScore = math.sqrt(mean_squared_error(actual_train_exports_LSTM, data_new.y_pred[:117]))
print('Train Score: %.2f RMSE' % (trainScore))

testScore = math.sqrt(mean_squared_error(actual_test_exports_LSTM, forecasted_test_exports_LSTM))
print('Test Score: %.2f RMSE' % (testScore))
Train Score: 12.18 RMSE
Test Score: 35.61 RMSE

A model for Investments

In [110]:
data = investments
In [111]:
#Creating a y variable, y+1
data['y+1_unscaled'] = data.y.shift(-1, fill_value=data.y.iloc[-1]) #shift 1 and fill the last (empty) row with previous value

#Normalizing
scaler = MinMaxScaler(feature_range=(0, 1)) #Scaling between 0 and 1
data['y_scaled'] = scaler.fit_transform(data.y.values.reshape(-1, 1))

#Creating a scaled y variable
data['y+1'] = data.y_scaled.shift(-1, fill_value=data.y_scaled.iloc[-1]) #shift 1 and fill the last (empty) row with previous value
In [112]:
# get the data as matrix
data_p = data.iloc[:,2:].values.astype('float32') #Only using the last to columns for modelling.

#Splitting into train and test
train, test = data_p[:118], data_p[118:]
print(len(train), len(test))

#Defining x and y
X_train = train[:,0]
y_train = train[:,1]

X_test = test[:,0]
y_test = test[:,1]

# reshape input to be [samples, time steps, features]
X_train = numpy.reshape(X_train, (X_train.shape[0], 1, 1))
X_test = numpy.reshape(X_test, (X_test.shape[0], 1, 1))
118 6
In [113]:
model = Sequential()
model.add(LSTM(units=64, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True)) #1 timestep and 1 feature
model.add(Dropout(0.1))
model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.1))
model.add(LSTM(64))
model.add(Dropout(0.1))
model.add(Dense(1))

model.compile( loss='mean_squared_error', optimizer='adam')

# fit network
history = model.fit( X_train, y_train, epochs=30, batch_size=16, validation_split=0.1, verbose=0, shuffle=False)
In [114]:
# make predictions
trainPredict = model.predict(X_train) #Predict y from x
testPredict = model.predict(X_test)
# invert predictions
trainPredict = scaler.inverse_transform(trainPredict)
y_train = scaler.inverse_transform([y_train])

testPredict = scaler.inverse_transform(testPredict)
y_test = scaler.inverse_transform([y_test])

#Creating a variable "y_pred" for all the predicted values for both train and test
data.insert(2, "y_pred", np.zeros((124,1)), False) #Inserting empty column with zeros to hold the predicted values
data['y_pred'].iloc[-6:] = testPredict.flatten() #Filling the last 6 rows with the predicted test values
data['y_pred'].iloc[:118] =trainPredict.flatten()

#The predicted values are one period ahead. Adjusting the time.
data_new= data[['y_pred', 'y+1_unscaled']]
data_new['Time']=pd.period_range('1990-04', periods=124, freq='Q')
data_new=data_new.set_index('Time')
data_new.index =data_new.index.to_timestamp()
data_new = data_new.iloc[:-1]
data_new

plt.figure(figsize=(15, 7.5))
plt.plot(data_new.y_pred, color='r', label='model')
plt.axvspan(data_new.index[-7], data_new.index[-1], alpha=0.5, color='lightgrey')
plt.plot(data_new['y+1_unscaled'], label='actual')
plt.title('Investments')
plt.ylabel('Bill. DKK')
plt.xlabel('Time')
plt.legend()
plt.show()
In [115]:
predicted_train_investments_LSTM = data_new.y_pred[:117]
actual_train_investments_LSTM = data_new['y+1_unscaled'][:117]

forecasted_test_investments_LSTM = data_new.y_pred.iloc[-6:]
actual_test_investments_LSTM = data_new['y+1_unscaled'].iloc[-6:]

#RMSEs
trainScore = math.sqrt(mean_squared_error(actual_train_investments_LSTM, data_new.y_pred[:117]))
print('Train Score: %.2f RMSE' % (trainScore))

testScore = math.sqrt(mean_squared_error(actual_test_investments_LSTM, forecasted_test_investments_LSTM))
print('Test Score: %.2f RMSE' % (testScore))
Train Score: 8.16 RMSE
Test Score: 7.72 RMSE

A model for Government Expenditures

In [116]:
data = government_expenditure
In [117]:
#Creating a y variable, y+1
data['y+1_unscaled'] = data.y.shift(-1, fill_value=data.y.iloc[-1]) #shift 1 and fill the last (empty) row with previous value

#Normalizing
scaler = MinMaxScaler(feature_range=(0, 1)) #Scaling between 0 and 1
data['y_scaled'] = scaler.fit_transform(data.y.values.reshape(-1, 1))

#Creating a scaled y variable
data['y+1'] = data.y_scaled.shift(-1, fill_value=data.y_scaled.iloc[-1]) #shift 1 and fill the last (empty) row with previous value
In [118]:
# get the data as matrix
data_p = data.iloc[:,2:].values.astype('float32') #Only using the last to columns for modelling.

#Splitting into train and test
train, test = data_p[:118], data_p[118:]
print(len(train), len(test))

#Defining x and y
X_train = train[:,0]
y_train = train[:,1]

X_test = test[:,0]
y_test = test[:,1]

# reshape input to be [samples, time steps, features]
X_train = numpy.reshape(X_train, (X_train.shape[0], 1, 1))
X_test = numpy.reshape(X_test, (X_test.shape[0], 1, 1))
118 6
In [119]:
model = Sequential()
model.add(LSTM(units=64, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True)) #1 timestep and 1 feature
model.add(Dropout(0.1))
model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.1))
model.add(LSTM(64))
model.add(Dropout(0.1))
model.add(Dense(1))

model.compile( loss='mean_squared_error', optimizer='adam')

# fit network
history = model.fit( X_train, y_train, epochs=30, batch_size=16, validation_split=0.1, verbose=0, shuffle=False)
In [120]:
# make predictions
trainPredict = model.predict(X_train) #Predict y from x
testPredict = model.predict(X_test)
# invert predictions
trainPredict = scaler.inverse_transform(trainPredict)
y_train = scaler.inverse_transform([y_train])

testPredict = scaler.inverse_transform(testPredict)
y_test = scaler.inverse_transform([y_test])

#Creating a variable "y_pred" for all the predicted values for both train and test
data.insert(2, "y_pred", np.zeros((124,1)), False) #Inserting empty column with zeros to hold the predicted values
data['y_pred'].iloc[-6:] = testPredict.flatten() #Filling the last 6 rows with the predicted test values
data['y_pred'].iloc[:118] =trainPredict.flatten()

#The predicted values are one period ahead. Adjusting the time.
data_new= data[['y_pred', 'y+1_unscaled']]
data_new['Time']=pd.period_range('1990-04', periods=124, freq='Q')
data_new=data_new.set_index('Time')
data_new.index =data_new.index.to_timestamp()
data_new = data_new.iloc[:-1]
data_new

plt.figure(figsize=(15, 7.5))
plt.plot(data_new.y_pred, color='r', label='model')
plt.axvspan(data_new.index[-7], data_new.index[-1], alpha=0.5, color='lightgrey')
plt.plot(data_new['y+1_unscaled'], label='actual')
plt.title('Government Expenditures')
plt.ylabel('Bill. DKK')
plt.xlabel('Time')
plt.legend()
plt.show()
In [121]:
predicted_train_gov_LSTM = data_new.y_pred[:117]
actual_train_gov_LSTM = data_new['y+1_unscaled'][:117]

forecasted_test_gov_LSTM = data_new.y_pred.iloc[-6:]
actual_test_gov_LSTM = data_new['y+1_unscaled'].iloc[-6:]

#RMSEs
trainScore = math.sqrt(mean_squared_error(actual_train_gov_LSTM, data_new.y_pred[:117]))
print('Train Score: %.2f RMSE' % (trainScore))

testScore = math.sqrt(mean_squared_error(actual_test_gov_LSTM, forecasted_test_gov_LSTM))
print('Test Score: %.2f RMSE' % (testScore))
Train Score: 4.66 RMSE
Test Score: 8.11 RMSE

A model for GDP

In [122]:
consumption = pd.read_csv('https://raw.githubusercontent.com/Louise198713/Speciale/main/Privatforbrug%201990K1-2020K4.csv',
                 sep=';',
                 header=0,
                 index_col=0,
                 parse_dates=[0],
                 names=['Time','consumption'])
consumption['Time']=pd.period_range('1990-01', periods=124, freq='Q')
consumption['consumption'] = pd.to_numeric(consumption['consumption'])
consumption=consumption.set_index('Time')
consumption.index = consumption.index.to_timestamp()

exports = pd.read_csv('https://raw.githubusercontent.com/Louise198713/Speciale/main/Eksport%201990K1-2020K4.csv',
                 sep=';',
                 header=0,
                 index_col=0,
                 parse_dates=[0],
                 names=['Time','exports'])
exports['Time']=pd.period_range('1990-01', periods=124, freq='Q')
exports['exports'] = pd.to_numeric(exports['exports'])
exports=exports.set_index('Time')
exports.index = exports.index.to_timestamp()

imports = pd.read_csv('https://raw.githubusercontent.com/Louise198713/Speciale/main/Import%201990K1-2020K4.csv',
                 sep=';',
                 header=0,
                 index_col=0,
                 parse_dates=[0],
                 names=['Time','imports'])
imports['Time']=pd.period_range('1990-01', periods=124, freq='Q')
imports['imports'] = pd.to_numeric(imports['imports'])
imports=imports.set_index('Time')
imports.index = imports.index.to_timestamp()

investments = pd.read_csv('https://raw.githubusercontent.com/Louise198713/Speciale/main/Investment1990-2020.csv',
                 sep=';',
                 header=0,
                 index_col=0,
                 parse_dates=[0],
                 names=['Time','investments'])
investments['Time']=pd.period_range('1990-01', periods=124, freq='Q')
investments['investments'] = pd.to_numeric(investments['investments'])
investments=investments.set_index('Time')
investments.index = investments.index.to_timestamp()

government_expenditure = pd.read_csv('https://raw.githubusercontent.com/Louise198713/Speciale/main/Public%20consumption1990-2020.csv',
                 sep=';',
                 header=0,
                 index_col=0,
                 parse_dates=[0],
                 names=['Time','government_expenditure'])
government_expenditure['Time']=pd.period_range('1990-01', periods=124, freq='Q')
government_expenditure['government_expenditure'] = pd.to_numeric(government_expenditure['government_expenditure'])
government_expenditure=government_expenditure.set_index('Time')
government_expenditure.index = government_expenditure.index.to_timestamp()

gdp = pd.read_csv('https://raw.githubusercontent.com/Louise198713/Speciale/main/GDP1990-2020.csv',
                 sep=';',
                 header=0,
                 index_col=0,
                 parse_dates=[0],
                 names=['Time','gdp'])
gdp['Time']=pd.period_range('1990-01', periods=124, freq='Q')
gdp['gdp'] = pd.to_numeric(gdp['gdp'])
gdp=gdp.set_index('Time')
gdp.index = gdp.index.to_timestamp()
In [123]:
data = pd.merge(gdp, consumption, on='Time')
data = pd.merge(data, imports, on='Time')
data = pd.merge(data, exports, on='Time')
data = pd.merge(data, investments, on='Time')
data = pd.merge(data, government_expenditure, on='Time')
data
Out[123]:
gdp consumption imports exports investments government_expenditure
Time
1990-01-01 210.2 101.7 63.8 74.1 48.0 50.2
1990-04-01 218.1 106.9 66.7 77.8 49.2 50.9
1990-07-01 209.5 105.4 64.0 76.0 41.0 51.0
1990-10-01 217.8 111.5 69.6 83.7 39.7 52.4
1991-01-01 220.2 107.8 65.2 77.2 47.2 53.4
... ... ... ... ... ... ...
2019-10-01 603.2 281.7 306.9 348.9 132.9 146.6
2020-01-01 576.2 261.8 294.7 332.4 139.3 139.5
2020-04-01 558.6 253.6 258.9 293.3 130.8 141.4
2020-07-01 572.7 254.2 271.9 317.6 136.8 139.4
2020-10-01 596.9 276.2 286.5 319.6 131.6 155.3

124 rows × 6 columns

In [124]:
#Creating a y variable
data['gdp_y'] = data['gdp'].shift(-1, fill_value=data['gdp'].iloc[-1])

#Splitting into test and train
train_df, test_df = data[:118], data[118:]

#Defining x and y 
X_train = train_df.iloc[:,:-1].values
y_train = train_df.iloc[:,-1].values

X_test = test_df.iloc[:,:-1].values
y_test = test_df.iloc[:,-1].values
In [125]:
data
Out[125]:
gdp consumption imports exports investments government_expenditure gdp_y
Time
1990-01-01 210.2 101.7 63.8 74.1 48.0 50.2 218.1
1990-04-01 218.1 106.9 66.7 77.8 49.2 50.9 209.5
1990-07-01 209.5 105.4 64.0 76.0 41.0 51.0 217.8
1990-10-01 217.8 111.5 69.6 83.7 39.7 52.4 220.2
1991-01-01 220.2 107.8 65.2 77.2 47.2 53.4 226.1
... ... ... ... ... ... ... ...
2019-10-01 603.2 281.7 306.9 348.9 132.9 146.6 576.2
2020-01-01 576.2 261.8 294.7 332.4 139.3 139.5 558.6
2020-04-01 558.6 253.6 258.9 293.3 130.8 141.4 572.7
2020-07-01 572.7 254.2 271.9 317.6 136.8 139.4 596.9
2020-10-01 596.9 276.2 286.5 319.6 131.6 155.3 596.9

124 rows × 7 columns

In [126]:
x_scaler = MinMaxScaler(feature_range=(-1, 1))
y_scaler = MinMaxScaler(feature_range=(-1, 1))

X_train = x_scaler.fit_transform(X_train)
y_train = y_scaler.fit_transform(y_train.reshape(-1,1))

X_test = x_scaler.transform(X_test)
y_test = y_scaler.transform(y_test.reshape(-1,1))
In [127]:
# reshape input to be [samples, time steps, features]
X_train = numpy.reshape(X_train, (118, 1, 6))
X_test = numpy.reshape(X_test, (6, 1, 6))
In [128]:
model = Sequential()
model.add(LSTM(64, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')

# fit network
history = model.fit(X_train, y_train, epochs=40, batch_size=32, validation_split=0.1, verbose=0, shuffle=False)
In [129]:
# make predictions
trainPredict = model.predict(X_train) #Predict y from x
testPredict = model.predict(X_test)
In [130]:
# invert predictions
trainPredict = y_scaler.inverse_transform(trainPredict)
y_train = y_scaler.inverse_transform(y_train)

testPredict = y_scaler.inverse_transform(testPredict)
y_test = y_scaler.inverse_transform(y_test)
In [131]:
#Creating a variable "gdp_pred" for all the predicted values for both train and test
data.insert(6, "gdp_pred", np.zeros((124,1)), False) #Inserting empty column with zeros to hold the predicted values
data['gdp_pred'].iloc[-6:] = testPredict.flatten() #Filling the last 6 rows with the predicted test values
data['gdp_pred'].iloc[:118] =trainPredict.flatten()
data
Out[131]:
gdp consumption imports exports investments government_expenditure gdp_pred gdp_y
Time
1990-01-01 210.2 101.7 63.8 74.1 48.0 50.2 222.869629 218.1
1990-04-01 218.1 106.9 66.7 77.8 49.2 50.9 228.668381 209.5
1990-07-01 209.5 105.4 64.0 76.0 41.0 51.0 221.740051 217.8
1990-10-01 217.8 111.5 69.6 83.7 39.7 52.4 228.727676 220.2
1991-01-01 220.2 107.8 65.2 77.2 47.2 53.4 229.415848 226.1
... ... ... ... ... ... ... ... ...
2019-10-01 603.2 281.7 306.9 348.9 132.9 146.6 580.036133 576.2
2020-01-01 576.2 261.8 294.7 332.4 139.3 139.5 562.234680 558.6
2020-04-01 558.6 253.6 258.9 293.3 130.8 141.4 541.484680 572.7
2020-07-01 572.7 254.2 271.9 317.6 136.8 139.4 550.938599 596.9
2020-10-01 596.9 276.2 286.5 319.6 131.6 155.3 572.775024 596.9

124 rows × 8 columns

In [132]:
data_new= data[['gdp_pred', 'gdp_y']]
data_new['Time']=pd.period_range('1990-04', periods=124, freq='Q')
data_new=data_new.set_index('Time')
data_new.index =data_new.index.to_timestamp()
data_new = data_new.iloc[:-1]
data_new
Out[132]:
gdp_pred gdp_y
Time
1990-04-01 222.869629 218.1
1990-07-01 228.668381 209.5
1990-10-01 221.740051 217.8
1991-01-01 228.727676 220.2
1991-04-01 229.415848 226.1
... ... ...
2019-10-01 563.111084 603.2
2020-01-01 580.036133 576.2
2020-04-01 562.234680 558.6
2020-07-01 541.484680 572.7
2020-10-01 550.938599 596.9

123 rows × 2 columns

In [133]:
plt.figure(figsize=(15, 7.5))
plt.plot(data_new.gdp_pred, color='r', label='model')
plt.axvspan(data_new.index[-7], data_new.index[-1], alpha=0.5, color='lightgrey')
plt.plot(data_new['gdp_y'], label='actual')
plt.title('GDP')
plt.ylabel('Bill. DKK')
plt.xlabel('Time')
plt.legend()
plt.show()
In [134]:
predicted_train_gdp_LSTM = data_new.gdp_pred[:117]
actual_train_gdp_LSTM = data_new['gdp_y'][:117]

forecasted_test_gdp_LSTM = data_new.gdp_pred.iloc[-6:]
actual_test_gdp_LSTM = data_new['gdp_y'].iloc[-6:]

#RMSEs
trainScore = math.sqrt(mean_squared_error(actual_train_gdp_LSTM, predicted_train_gdp_LSTM))
print('Train Score: %.2f RMSE' % (trainScore))

testScore = math.sqrt(mean_squared_error(actual_test_gdp_LSTM, forecasted_test_gdp_LSTM))
print('Test Score: %.2f RMSE' % (testScore))
Train Score: 16.03 RMSE
Test Score: 28.79 RMSE

Evaluating forecasts

Diebold-Mariano test

In [135]:
def diebold_mariano_test(actual_lst, pred1_lst, pred2_lst, h = 1, crit="MSE", power = 2):
    # Routine for checking errors
    def error_check():
        rt = 0
        msg = ""
        # Check if h is an integer
        if (not isinstance(h, int)):
            rt = -1
            msg = "The type of the number of steps ahead (h) is not an integer."
            return (rt,msg)
        # Check the range of h
        if (h < 1):
            rt = -1
            msg = "The number of steps ahead (h) is not large enough."
            return (rt,msg)
        len_act = len(actual_lst)
        len_p1  = len(pred1_lst)
        len_p2  = len(pred2_lst)
        # Check if lengths of actual values and predicted values are equal
        if (len_act != len_p1 or len_p1 != len_p2 or len_act != len_p2):
            rt = -1
            msg = "Lengths of actual_lst, pred1_lst and pred2_lst do not match."
            return (rt,msg)
        # Check range of h
        if (h >= len_act):
            rt = -1
            msg = "The number of steps ahead is too large."
            return (rt,msg)
        # Check if criterion supported
        if (crit != "MSE" and crit != "MAPE" and crit != "MAD" and crit != "poly"):
            rt = -1
            msg = "The criterion is not supported."
            return (rt,msg)  
        # Check if every value of the input lists are numerical values
        from re import compile as re_compile
        comp = re_compile("^\d+?\.\d+?$")  
        def compiled_regex(s):
            """ Returns True is string is a number. """
            if comp.match(s) is None:
                return s.isdigit()
            return True
        for actual, pred1, pred2 in zip(actual_lst, pred1_lst, pred2_lst):
            is_actual_ok = compiled_regex(str(abs(actual)))
            is_pred1_ok = compiled_regex(str(abs(pred1)))
            is_pred2_ok = compiled_regex(str(abs(pred2)))
            if (not (is_actual_ok and is_pred1_ok and is_pred2_ok)):  
                msg = "An element in the actual_lst, pred1_lst or pred2_lst is not numeric."
                rt = -1
                return (rt,msg)
        return (rt,msg)
    
    # Error check
    error_code = error_check()
    # Raise error if cannot pass error check
    if (error_code[0] == -1):
        raise SyntaxError(error_code[1])
        return
    # Import libraries
    from scipy.stats import t
    import collections
    import pandas as pd
    import numpy as np
    
    # Initialise lists
    e1_lst = []
    e2_lst = []
    d_lst  = []
    
    # convert every value of the lists into real values
    actual_lst = pd.Series(actual_lst).apply(lambda x: float(x)).tolist()
    pred1_lst = pd.Series(pred1_lst).apply(lambda x: float(x)).tolist()
    pred2_lst = pd.Series(pred2_lst).apply(lambda x: float(x)).tolist()
    
    # Length of lists (as real numbers)
    T = float(len(actual_lst))
    
    # construct d according to crit
    if (crit == "MSE"):
        for actual,p1,p2 in zip(actual_lst,pred1_lst,pred2_lst):
            e1_lst.append((actual - p1)**2)
            e2_lst.append((actual - p2)**2)
        for e1, e2 in zip(e1_lst, e2_lst):
            d_lst.append(e1 - e2)
    elif (crit == "MAD"):
        for actual,p1,p2 in zip(actual_lst,pred1_lst,pred2_lst):
            e1_lst.append(abs(actual - p1))
            e2_lst.append(abs(actual - p2))
        for e1, e2 in zip(e1_lst, e2_lst):
            d_lst.append(e1 - e2)
    elif (crit == "MAPE"):
        for actual,p1,p2 in zip(actual_lst,pred1_lst,pred2_lst):
            e1_lst.append(abs((actual - p1)/actual))
            e2_lst.append(abs((actual - p2)/actual))
        for e1, e2 in zip(e1_lst, e2_lst):
            d_lst.append(e1 - e2)
    elif (crit == "poly"):
        for actual,p1,p2 in zip(actual_lst,pred1_lst,pred2_lst):
            e1_lst.append(((actual - p1))**(power))
            e2_lst.append(((actual - p2))**(power))
        for e1, e2 in zip(e1_lst, e2_lst):
            d_lst.append(e1 - e2)    
    
    # Mean of d        
    mean_d = pd.Series(d_lst).mean()
    
    # Find autocovariance and construct DM test statistics
    def autocovariance(Xi, N, k, Xs):
        autoCov = 0
        T = float(N)
        for i in np.arange(0, N-k):
              autoCov += ((Xi[i+k])-Xs)*(Xi[i]-Xs)
        return (1/(T))*autoCov
    gamma = []
    for lag in range(0,h):
        gamma.append(autocovariance(d_lst,len(d_lst),lag,mean_d)) # 0, 1, 2
    V_d = (gamma[0] + 2*sum(gamma[1:]))/T
    DM_stat=V_d**(-0.5)*mean_d
    harvey_adj=((T+1-2*h+h*(h-1)/T)/T)**(0.5)
    DM_stat = harvey_adj*DM_stat
    # Find p-value
    p_value = 2*t.cdf(-abs(DM_stat), df = T - 1)
    
    # Construct named tuple for return
    dm_return = collections.namedtuple('dm_return', 'DM p_value')
    
    rt = dm_return(DM = DM_stat, p_value = p_value)
    
    return rt

DM test for Consumption

In [136]:
dm_consumption_base_ARIMA = diebold_mariano_test(consumption.consumption[-6:], forecast_values_consumption_baseline.squeeze(), forecast_values_consumption_ARIMA)
dm_consumption_base_LSTM = diebold_mariano_test(consumption.consumption[-6:], forecast_values_consumption_baseline.squeeze(), forecasted_test_consumption_LSTM)
dm_consumption_ARIMA_LSTM = diebold_mariano_test(consumption.consumption[-6:], forecast_values_consumption_ARIMA.squeeze(), forecasted_test_consumption_LSTM)
print(f'Baseline and ARIMA:')
print(dm_consumption_base_ARIMA)
print(f'Baseline and LSTM:')
print(dm_consumption_base_LSTM)
print(f'ARIMA and LSTM:')
print(dm_consumption_ARIMA_LSTM)
Baseline and ARIMA:
dm_return(DM=-1.3515603156613512, p_value=0.2344429337465337)
Baseline and LSTM:
dm_return(DM=-1.856064290728856, p_value=0.12258933322799889)
ARIMA and LSTM:
dm_return(DM=-1.2069547967829637, p_value=0.28142949023172803)

DM test for Imports

In [137]:
dm_consumption_base_ARIMA = diebold_mariano_test(imports.imports[-6:], forecast_values_imports_baseline.squeeze(), forecast_values_imports_ARIMA)
dm_consumption_base_LSTM = diebold_mariano_test(imports.imports[-6:], forecast_values_imports_baseline.squeeze(), forecasted_test_imports_LSTM)
dm_consumption_ARIMA_LSTM = diebold_mariano_test(imports.imports[-6:], forecast_values_imports_ARIMA.squeeze(), forecasted_test_imports_LSTM)
print(f'Baseline and ARIMA:')
print(dm_consumption_base_ARIMA)
print(f'Baseline and LSTM:')
print(dm_consumption_base_LSTM)
print(f'ARIMA and LSTM:')
print(dm_consumption_ARIMA_LSTM)
Baseline and ARIMA:
dm_return(DM=-0.5878616823450776, p_value=0.582171311137412)
Baseline and LSTM:
dm_return(DM=-1.0850961615611092, p_value=0.32740376435937163)
ARIMA and LSTM:
dm_return(DM=-1.055565750203627, p_value=0.33947790666126154)

DM test for Exports

In [138]:
dm_consumption_base_ARIMA = diebold_mariano_test(exports.exports[-6:], forecast_values_exports_baseline.squeeze(), forecast_values_exports_ARIMA)
dm_consumption_base_LSTM = diebold_mariano_test(exports.exports[-6:], forecast_values_exports_baseline.squeeze(), forecasted_test_exports_LSTM)
dm_consumption_ARIMA_LSTM = diebold_mariano_test(exports.exports[-6:], forecast_values_exports_ARIMA.squeeze(), forecasted_test_exports_LSTM)
print(f'Baseline and ARIMA:')
print(dm_consumption_base_ARIMA)
print(f'Baseline and LSTM:')
print(dm_consumption_base_LSTM)
print(f'ARIMA and LSTM:')
print(dm_consumption_ARIMA_LSTM)
Baseline and ARIMA:
dm_return(DM=-1.2035120222048556, p_value=0.2826456227222946)
Baseline and LSTM:
dm_return(DM=-0.9009832476296876, p_value=0.40889563533039497)
ARIMA and LSTM:
dm_return(DM=-0.1346835319034181, p_value=0.8981158212484757)

DM test for Investments

In [139]:
dm_consumption_base_ARIMA = diebold_mariano_test(investments.investments[-6:], forecast_values_investments_baseline.squeeze(), forecast_values_investments_ARIMA)
dm_consumption_base_LSTM = diebold_mariano_test(investments.investments[-6:], forecast_values_investments_baseline.squeeze(), forecasted_test_investments_LSTM)
dm_consumption_ARIMA_LSTM = diebold_mariano_test(investments.investments[-6:], forecast_values_investments_ARIMA.squeeze(), forecasted_test_investments_LSTM)
print(f'Baseline and ARIMA:')
print(dm_consumption_base_ARIMA)
print(f'Baseline and LSTM:')
print(dm_consumption_base_LSTM)
print(f'ARIMA and LSTM:')
print(dm_consumption_ARIMA_LSTM)
Baseline and ARIMA:
dm_return(DM=-0.6013335243623688, p_value=0.5738388284547491)
Baseline and LSTM:
dm_return(DM=-0.7703016978930859, p_value=0.47592204434462204)
ARIMA and LSTM:
dm_return(DM=-0.5255686757619156, p_value=0.6216537557308306)

DM test for Government Expenditures

In [140]:
dm_consumption_base_ARIMA = diebold_mariano_test(government_expenditure.government_expenditure[-6:], forecast_values_gov_baseline.squeeze(), forecast_values_government_expenditure_ARIMA)
dm_consumption_base_LSTM = diebold_mariano_test(government_expenditure.government_expenditure[-6:], forecast_values_gov_baseline.squeeze(), forecasted_test_gov_LSTM)
dm_consumption_ARIMA_LSTM = diebold_mariano_test(government_expenditure.government_expenditure[-6:], forecast_values_government_expenditure_ARIMA.squeeze(), forecasted_test_gov_LSTM)
print(f'Baseline and ARIMA:')
print(dm_consumption_base_ARIMA)
print(f'Baseline and LSTM:')
print(dm_consumption_base_LSTM)
print(f'ARIMA and LSTM:')
print(dm_consumption_ARIMA_LSTM)
Baseline and ARIMA:
dm_return(DM=0.9510741402149383, p_value=0.38524203699706905)
Baseline and LSTM:
dm_return(DM=-2.026942591793739, p_value=0.09849435647603585)
ARIMA and LSTM:
dm_return(DM=-2.205750083315772, p_value=0.07852335436478752)

DM test for GDP

In [141]:
dm_consumption_base_ARIMA = diebold_mariano_test(gdp.gdp[-6:], forecast_values_gov_baseline.squeeze(), forecast_values_gdp_ARIMA)
dm_consumption_base_LSTM = diebold_mariano_test(gdp.gdp[-6:], forecast_values_gov_baseline.squeeze(), forecasted_test_gdp_LSTM)
dm_consumption_ARIMA_LSTM = diebold_mariano_test(gdp.gdp[-6:], forecast_values_gdp_ARIMA.squeeze(), forecasted_test_gdp_LSTM)
print(f'Baseline and ARIMA:')
print(dm_consumption_base_ARIMA)
print(f'Baseline and LSTM:')
print(dm_consumption_base_LSTM)
print(f'ARIMA and LSTM:')
print(dm_consumption_ARIMA_LSTM)
Baseline and ARIMA:
dm_return(DM=38.43051887171322, p_value=2.247900896356092e-07)
Baseline and LSTM:
dm_return(DM=40.382252782722325, p_value=1.7559087731980415e-07)
ARIMA and LSTM:
dm_return(DM=-1.8014848974054003, p_value=0.13150595113394006)
In [141]:
%%shell
jupyter nbconvert --to html /content/MasterThesis.ipynb